mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
add hip
This commit is contained in:
parent
710822f32e
commit
c8da7d0f70
@ -173,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
|||||||
(cmakeBool "GGML_NATIVE" false)
|
(cmakeBool "GGML_NATIVE" false)
|
||||||
(cmakeBool "GGML_BLAS" useBlas)
|
(cmakeBool "GGML_BLAS" useBlas)
|
||||||
(cmakeBool "GGML_CUDA" useCuda)
|
(cmakeBool "GGML_CUDA" useCuda)
|
||||||
(cmakeBool "GGML_HIPBLAS" useRocm)
|
(cmakeBool "GGML_HIP" useRocm)
|
||||||
(cmakeBool "GGML_METAL" useMetalKit)
|
(cmakeBool "GGML_METAL" useMetalKit)
|
||||||
(cmakeBool "GGML_VULKAN" useVulkan)
|
(cmakeBool "GGML_VULKAN" useVulkan)
|
||||||
(cmakeBool "GGML_STATIC" enableStatic)
|
(cmakeBool "GGML_STATIC" enableStatic)
|
||||||
|
8
.github/workflows/build.yml
vendored
8
.github/workflows/build.yml
vendored
@ -405,13 +405,13 @@ jobs:
|
|||||||
- name: Build with native CMake HIP support
|
- name: Build with native CMake HIP support
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
|
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Build with legacy HIP support
|
- name: Build with legacy HIP support
|
||||||
id: cmake_build_legacy_hip
|
id: cmake_build_legacy_hip
|
||||||
run: |
|
run: |
|
||||||
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
|
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
|
||||||
cmake --build build2 --config Release -j $(nproc)
|
cmake --build build2 --config Release -j $(nproc)
|
||||||
|
|
||||||
ubuntu-22-cmake-sycl:
|
ubuntu-22-cmake-sycl:
|
||||||
@ -1014,7 +1014,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||||
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
|
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
|
||||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
|
|
||||||
windows-latest-cmake-hip-release:
|
windows-latest-cmake-hip-release:
|
||||||
@ -1050,7 +1050,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||||
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
|
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
|
||||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
md "build\bin\rocblas\library\"
|
md "build\bin\rocblas\library\"
|
||||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||||
|
2
Makefile
2
Makefile
@ -819,7 +819,7 @@ ifdef GGML_HIPBLAS
|
|||||||
GGML_CUDA_MMV_Y ?= 1
|
GGML_CUDA_MMV_Y ?= 1
|
||||||
GGML_CUDA_KQUANTS_ITER ?= 2
|
GGML_CUDA_KQUANTS_ITER ?= 2
|
||||||
|
|
||||||
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
|
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
|
||||||
|
|
||||||
ifdef GGML_HIP_UMA
|
ifdef GGML_HIP_UMA
|
||||||
MK_CPPFLAGS += -DGGML_HIP_UMA
|
MK_CPPFLAGS += -DGGML_HIP_UMA
|
||||||
|
@ -6,7 +6,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
|||||||
set(GGML_BLAS @GGML_BLAS@)
|
set(GGML_BLAS @GGML_BLAS@)
|
||||||
set(GGML_CUDA @GGML_CUDA@)
|
set(GGML_CUDA @GGML_CUDA@)
|
||||||
set(GGML_METAL @GGML_METAL@)
|
set(GGML_METAL @GGML_METAL@)
|
||||||
set(GGML_HIPBLAS @GGML_HIPBLAS@)
|
set(GGML_HIP @GGML_HIPS@)
|
||||||
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
||||||
set(GGML_VULKAN @GGML_VULKAN@)
|
set(GGML_VULKAN @GGML_VULKAN@)
|
||||||
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
|
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
|
||||||
|
@ -230,7 +230,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
|||||||
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||||
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
&& cmake --build build --config Release -- -j 16
|
&& cmake --build build --config Release -- -j 16
|
||||||
```
|
```
|
||||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
||||||
@ -247,7 +247,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
|||||||
```bash
|
```bash
|
||||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
||||||
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
||||||
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
&& cmake --build build -- -j 16
|
&& cmake --build build -- -j 16
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -259,7 +259,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
|||||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
set PATH=%HIP_PATH%\bin;%PATH%
|
set PATH=%HIP_PATH%\bin;%PATH%
|
||||||
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
||||||
cmake --build build
|
cmake --build build
|
||||||
```
|
```
|
||||||
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
||||||
|
@ -116,6 +116,7 @@ endif()
|
|||||||
|
|
||||||
# ggml core
|
# ggml core
|
||||||
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
|
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
|
||||||
|
option(GGML_CPU "ggml: enable CPU backend" ON)
|
||||||
|
|
||||||
# 3rd party libs / backends
|
# 3rd party libs / backends
|
||||||
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
|
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
|
||||||
@ -141,7 +142,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
|||||||
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||||
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
||||||
|
|
||||||
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
|
option(GGML_HIP "ggml: use HIP" OFF)
|
||||||
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_HIPBLAS
|
#ifdef GGML_USE_HIP
|
||||||
#define GGML_CUDA_NAME "ROCm"
|
#define GGML_CUDA_NAME "ROCm"
|
||||||
#define GGML_CUBLAS_NAME "hipBLAS"
|
#define GGML_CUBLAS_NAME "hipBLAS"
|
||||||
#elif defined(GGML_USE_MUSA)
|
#elif defined(GGML_USE_MUSA)
|
||||||
|
@ -29,115 +29,6 @@ endif()
|
|||||||
unset(GGML_EXTRA_LIBS_PRIVATE)
|
unset(GGML_EXTRA_LIBS_PRIVATE)
|
||||||
unset(GGML_EXTRA_LIBS_PUBLIC)
|
unset(GGML_EXTRA_LIBS_PUBLIC)
|
||||||
|
|
||||||
if (GGML_HIPBLAS)
|
|
||||||
if (NOT EXISTS $ENV{ROCM_PATH})
|
|
||||||
if (NOT EXISTS /opt/rocm)
|
|
||||||
set(ROCM_PATH /usr)
|
|
||||||
else()
|
|
||||||
set(ROCM_PATH /opt/rocm)
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
set(ROCM_PATH $ENV{ROCM_PATH})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
|
||||||
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
|
||||||
|
|
||||||
# CMake on Windows doesn't support the HIP language yet
|
|
||||||
if (WIN32)
|
|
||||||
set(CXX_IS_HIPCC TRUE)
|
|
||||||
else()
|
|
||||||
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (CXX_IS_HIPCC)
|
|
||||||
if (LINUX)
|
|
||||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
|
||||||
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
|
|
||||||
" Prefer setting the HIP compiler directly. See README for details.")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
|
|
||||||
if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
|
|
||||||
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
|
|
||||||
endif()
|
|
||||||
cmake_minimum_required(VERSION 3.21)
|
|
||||||
enable_language(HIP)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
find_package(hip REQUIRED)
|
|
||||||
find_package(hipblas REQUIRED)
|
|
||||||
find_package(rocblas REQUIRED)
|
|
||||||
|
|
||||||
message(STATUS "HIP and hipBLAS found")
|
|
||||||
|
|
||||||
file(GLOB GGML_HEADERS_ROCM "ggml-cuda/*.cuh")
|
|
||||||
list(APPEND GGML_HEADERS_ROCM "../include/ggml-cuda.h")
|
|
||||||
|
|
||||||
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
|
|
||||||
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
|
|
||||||
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
|
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
|
||||||
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
|
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
|
||||||
|
|
||||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
|
||||||
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
|
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
|
||||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
|
||||||
else()
|
|
||||||
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
|
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
|
||||||
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
|
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
|
||||||
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
|
|
||||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
|
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_HIPBLAS)
|
|
||||||
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
|
|
||||||
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
|
|
||||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
|
|
||||||
|
|
||||||
if (GGML_HIP_UMA)
|
|
||||||
add_compile_definitions(GGML_HIP_UMA)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_CUDA_FORCE_DMMV)
|
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_CUDA_FORCE_MMQ)
|
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_CUDA_FORCE_CUBLAS)
|
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_CUDA_NO_PEER_COPY)
|
|
||||||
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (CXX_IS_HIPCC)
|
|
||||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
|
||||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
|
|
||||||
else()
|
|
||||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_STATIC)
|
|
||||||
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
function(get_flags CCID CCVER)
|
function(get_flags CCID CCVER)
|
||||||
set(C_FLAGS "")
|
set(C_FLAGS "")
|
||||||
set(CXX_FLAGS "")
|
set(CXX_FLAGS "")
|
||||||
@ -354,12 +245,12 @@ function(ggml_add_backend backend)
|
|||||||
endif()
|
endif()
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
set(GGML_CPU ON)
|
|
||||||
ggml_add_backend(CPU)
|
ggml_add_backend(CPU)
|
||||||
ggml_add_backend(AMX)
|
ggml_add_backend(AMX)
|
||||||
ggml_add_backend(BLAS)
|
ggml_add_backend(BLAS)
|
||||||
ggml_add_backend(CANN)
|
ggml_add_backend(CANN)
|
||||||
ggml_add_backend(CUDA)
|
ggml_add_backend(CUDA)
|
||||||
|
ggml_add_backend(HIP)
|
||||||
ggml_add_backend(Kompute)
|
ggml_add_backend(Kompute)
|
||||||
ggml_add_backend(METAL)
|
ggml_add_backend(METAL)
|
||||||
ggml_add_backend(RPC)
|
ggml_add_backend(RPC)
|
||||||
|
@ -1,10 +1,4 @@
|
|||||||
if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
|
if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
|
||||||
else()
|
|
||||||
set(GGML_AMX OFF PARENT_SCOPE)
|
|
||||||
message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_AMX)
|
|
||||||
message(STATUS "Using AMX")
|
message(STATUS "Using AMX")
|
||||||
|
|
||||||
file(GLOB GGML_HEADERS_AMX "*.h")
|
file(GLOB GGML_HEADERS_AMX "*.h")
|
||||||
@ -104,4 +98,7 @@ if (GGML_AMX)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
|
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
|
||||||
|
else()
|
||||||
|
set(GGML_AMX OFF PARENT_SCOPE)
|
||||||
|
message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.")
|
||||||
endif()
|
endif()
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIP)
|
||||||
#define GGML_COMMON_DECL_HIP
|
#define GGML_COMMON_DECL_HIP
|
||||||
#define GGML_COMMON_IMPL_HIP
|
#define GGML_COMMON_IMPL_HIP
|
||||||
#else
|
#else
|
||||||
@ -26,13 +26,13 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIP)
|
||||||
#include "vendors/hip.h"
|
#include "vendors/hip.h"
|
||||||
#elif defined(GGML_USE_MUSA)
|
#elif defined(GGML_USE_MUSA)
|
||||||
#include "vendors/musa.h"
|
#include "vendors/musa.h"
|
||||||
#else
|
#else
|
||||||
#include "vendors/cuda.h"
|
#include "vendors/cuda.h"
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIP)
|
||||||
|
|
||||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||||
@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
|||||||
|
|
||||||
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIP)
|
||||||
static const char * cu_get_error_str(CUresult err) {
|
static const char * cu_get_error_str(CUresult err) {
|
||||||
const char * err_str;
|
const char * err_str;
|
||||||
cuGetErrorString(err, &err_str);
|
cuGetErrorString(err, &err_str);
|
||||||
@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float
|
|||||||
typedef float2 dfloat2;
|
typedef float2 dfloat2;
|
||||||
#endif // GGML_CUDA_F16
|
#endif // GGML_CUDA_F16
|
||||||
|
|
||||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
#define FP16_AVAILABLE
|
#define FP16_AVAILABLE
|
||||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||||
|
|
||||||
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||||
#define FAST_FP16_AVAILABLE
|
#define FAST_FP16_AVAILABLE
|
||||||
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#define FP16_MMA_AVAILABLE
|
#define FP16_MMA_AVAILABLE
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||||
#define INT8_MMA_AVAILABLE
|
#define INT8_MMA_AVAILABLE
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||||
|
|
||||||
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
||||||
#define FLASH_ATTN_AVAILABLE
|
#define FLASH_ATTN_AVAILABLE
|
||||||
@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) {
|
|||||||
static __device__ void no_device_code(
|
static __device__ void no_device_code(
|
||||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
||||||
file_name, line, function_name, arch);
|
file_name, line, function_name, arch);
|
||||||
GGML_UNUSED(arch_list);
|
GGML_UNUSED(arch_list);
|
||||||
#else
|
#else
|
||||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
||||||
file_name, line, function_name, arch, arch_list);
|
file_name, line, function_name, arch, arch_list);
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
__trap();
|
__trap();
|
||||||
|
|
||||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
GGML_UNUSED(no_device_code); // suppress unused function warning
|
||||||
@ -176,7 +176,7 @@ static __device__ void no_device_code(
|
|||||||
#endif // __CUDA_ARCH__
|
#endif // __CUDA_ARCH__
|
||||||
|
|
||||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||||
return __reduce_add_sync(0xffffffff, x);
|
return __reduce_add_sync(0xffffffff, x);
|
||||||
#else
|
#else
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
|||||||
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
||||||
}
|
}
|
||||||
return x;
|
return x;
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
||||||
@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
|||||||
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
|
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
|
||||||
@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
|||||||
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
||||||
}
|
}
|
||||||
return a;
|
return a;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
|||||||
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||||
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
||||||
#else
|
#else
|
||||||
return __hmax(a, b);
|
return __hmax(a, b);
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||||
|
|
||||||
#else
|
#else
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
|
|||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
|
||||||
#if CUDART_VERSION >= CUDART_HMAX
|
#if CUDART_VERSION >= CUDART_HMAX
|
||||||
return __hmax2(a, b);
|
return __hmax2(a, b);
|
||||||
@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
|
|||||||
GGML_UNUSED(a);
|
GGML_UNUSED(a);
|
||||||
GGML_UNUSED(b);
|
GGML_UNUSED(b);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||||
@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
|||||||
#else
|
#else
|
||||||
GGML_UNUSED(x);
|
GGML_UNUSED(x);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||||
}
|
}
|
||||||
|
|
||||||
#if CUDART_VERSION < CUDART_HMASK
|
#if CUDART_VERSION < CUDART_HMASK
|
||||||
@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
|||||||
#endif // CUDART_VERSION < CUDART_HMASK
|
#endif // CUDART_VERSION < CUDART_HMASK
|
||||||
|
|
||||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
||||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||||
#elif defined(RDNA3)
|
#elif defined(RDNA3)
|
||||||
@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
|||||||
#endif
|
#endif
|
||||||
return c;
|
return c;
|
||||||
|
|
||||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||||
return __dp4a(a, b, c);
|
return __dp4a(a, b, c);
|
||||||
@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
|||||||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: move to ggml-common.h
|
// TODO: move to ggml-common.h
|
||||||
|
@ -517,9 +517,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<int D, int parallel_blocks> // D == head size
|
template<int D, int parallel_blocks> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_combine_results(
|
static __global__ void flash_attn_combine_results(
|
||||||
const float * __restrict__ VKQ_parts,
|
const float * __restrict__ VKQ_parts,
|
||||||
const float2 * __restrict__ VKQ_meta,
|
const float2 * __restrict__ VKQ_meta,
|
||||||
|
@ -5,9 +5,9 @@
|
|||||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||||
|
|
||||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_tile_ext_f16(
|
static __global__ void flash_attn_tile_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -5,9 +5,9 @@
|
|||||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||||
|
|
||||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_tile_ext_f32(
|
static __global__ void flash_attn_tile_ext_f32(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -2,9 +2,9 @@
|
|||||||
#include "fattn-common.cuh"
|
#include "fattn-common.cuh"
|
||||||
|
|
||||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_vec_ext_f16(
|
static __global__ void flash_attn_vec_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -2,9 +2,9 @@
|
|||||||
#include "fattn-common.cuh"
|
#include "fattn-common.cuh"
|
||||||
|
|
||||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_vec_ext_f32(
|
static __global__ void flash_attn_vec_ext_f32(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -7,9 +7,9 @@
|
|||||||
|
|
||||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
|
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_ext_f16(
|
static __global__ void flash_attn_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -91,7 +91,7 @@ int ggml_cuda_get_device() {
|
|||||||
|
|
||||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||||
ggml_cuda_set_device(device);
|
ggml_cuda_set_device(device);
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
|
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
|
||||||
auto res = hipMallocManaged(ptr, size);
|
auto res = hipMallocManaged(ptr, size);
|
||||||
if (res == hipSuccess) {
|
if (res == hipSuccess) {
|
||||||
// if error we "need" to know why...
|
// if error we "need" to know why...
|
||||||
@ -100,7 +100,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|||||||
return res;
|
return res;
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIP)
|
||||||
cudaError_t err;
|
cudaError_t err;
|
||||||
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
||||||
{
|
{
|
||||||
@ -113,7 +113,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
|||||||
return err;
|
return err;
|
||||||
#else
|
#else
|
||||||
return cudaMalloc(ptr, size);
|
return cudaMalloc(ptr, size);
|
||||||
#endif // !defined(GGML_USE_HIPBLAS)
|
#endif // !defined(GGML_USE_HIP)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -151,7 +151,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
int device_vmm = 0;
|
int device_vmm = 0;
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
CUdevice device;
|
CUdevice device;
|
||||||
CU_CHECK(cuDeviceGet(&device, id));
|
CU_CHECK(cuDeviceGet(&device, id));
|
||||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||||
@ -163,7 +163,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|||||||
alloc_prop.location.id = id;
|
alloc_prop.location.id = id;
|
||||||
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
info.devices[id].vmm = !!device_vmm;
|
info.devices[id].vmm = !!device_vmm;
|
||||||
|
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
@ -175,13 +175,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|||||||
|
|
||||||
info.devices[id].nsm = prop.multiProcessorCount;
|
info.devices[id].nsm = prop.multiProcessorCount;
|
||||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||||
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
||||||
#else
|
#else
|
||||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
@ -299,7 +299,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// pool with virtual memory
|
// pool with virtual memory
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||||
|
|
||||||
@ -393,14 +393,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
|||||||
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
|
|
||||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
if (ggml_cuda_info().devices[device].vmm) {
|
if (ggml_cuda_info().devices[device].vmm) {
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1325,7 +1325,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
|||||||
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
||||||
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||||
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
||||||
cudaMemcpy3DPeerParms p = {};
|
cudaMemcpy3DPeerParms p = {};
|
||||||
p.dstDevice = dstDevice;
|
p.dstDevice = dstDevice;
|
||||||
@ -1339,7 +1339,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
|||||||
GGML_UNUSED(dstDevice);
|
GGML_UNUSED(dstDevice);
|
||||||
GGML_UNUSED(srcDevice);
|
GGML_UNUSED(srcDevice);
|
||||||
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_op_mul_mat(
|
static void ggml_cuda_op_mul_mat(
|
||||||
|
@ -100,9 +100,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
|||||||
return 128;
|
return 128;
|
||||||
#else // INT8_MMA_AVAILABLE
|
#else // INT8_MMA_AVAILABLE
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
return 128;
|
return 128;
|
||||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#ifdef GGML_CUDA_FORCE_MMQ
|
#ifdef GGML_CUDA_FORCE_MMQ
|
||||||
@ -115,7 +115,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
|||||||
return 64;
|
return 64;
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#endif // INT8_MMA_AVAILABLE
|
#endif // INT8_MMA_AVAILABLE
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -124,7 +124,7 @@ static constexpr int get_mmq_y_host(const int cc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static constexpr __device__ int get_mmq_y_device() {
|
static constexpr __device__ int get_mmq_y_device() {
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#if defined(RDNA1)
|
#if defined(RDNA1)
|
||||||
return 64;
|
return 64;
|
||||||
#else
|
#else
|
||||||
@ -136,7 +136,7 @@ static constexpr __device__ int get_mmq_y_device() {
|
|||||||
#else
|
#else
|
||||||
return 64;
|
return 64;
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
|
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
|
||||||
@ -2569,7 +2569,7 @@ static __device__ void mul_mat_q_process_tile(
|
|||||||
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
|
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
|
||||||
|
|
||||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
#if defined(RDNA3) || defined(RDNA2)
|
#if defined(RDNA3) || defined(RDNA2)
|
||||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||||
#endif // defined(RDNA3) || defined(RDNA2)
|
#endif // defined(RDNA3) || defined(RDNA2)
|
||||||
@ -2579,7 +2579,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
|||||||
#else
|
#else
|
||||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||||
static __global__ void mul_mat_q(
|
static __global__ void mul_mat_q(
|
||||||
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||||
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
|
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
|
||||||
@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
|
|||||||
constexpr int mmq_y = get_mmq_y_device();
|
constexpr int mmq_y = get_mmq_y_device();
|
||||||
|
|
||||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||||
{
|
{
|
||||||
constexpr bool fixup = false;
|
constexpr bool fixup = false;
|
||||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||||
@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
|
|||||||
blockIdx.x, blockIdx.y, 0, ne00/qk);
|
blockIdx.x, blockIdx.y, 0, ne00/qk);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||||
|
|
||||||
const int64_t blocks_per_ne00 = ne00 / qk;
|
const int64_t blocks_per_ne00 = ne00 / qk;
|
||||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||||
@ -2765,14 +2765,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
|||||||
|
|
||||||
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
|
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||||
if (!shmem_limit_raised[id]) {
|
if (!shmem_limit_raised[id]) {
|
||||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||||
shmem_limit_raised[id] = true;
|
shmem_limit_raised[id] = true;
|
||||||
}
|
}
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
|
||||||
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
|
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||||
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||||
|
@ -48,10 +48,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <ggml_type type, int ncols_y>
|
template <ggml_type type, int ncols_y>
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||||
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void mul_mat_vec_q(
|
static __global__ void mul_mat_vec_q(
|
||||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||||
@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
|
|||||||
|
|
||||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||||
constexpr int nwarps = 1;
|
constexpr int nwarps = 1;
|
||||||
constexpr int rows_per_cuda_block = 1;
|
constexpr int rows_per_cuda_block = 1;
|
||||||
#else
|
#else
|
||||||
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
||||||
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
||||||
|
|
||||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||||
const int row0 = rows_per_cuda_block*blockIdx.x;
|
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||||
#define USE_CUB
|
#define USE_CUB
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||||
|
|
||||||
#ifdef USE_CUB
|
#ifdef USE_CUB
|
||||||
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
|
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
|
||||||
|
113
ggml/src/ggml-hip/CMakeLists.txt
Normal file
113
ggml/src/ggml-hip/CMakeLists.txt
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
if (NOT EXISTS $ENV{ROCM_PATH})
|
||||||
|
if (NOT EXISTS /opt/rocm)
|
||||||
|
set(ROCM_PATH /usr)
|
||||||
|
else()
|
||||||
|
set(ROCM_PATH /opt/rocm)
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
set(ROCM_PATH $ENV{ROCM_PATH})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
||||||
|
|
||||||
|
# CMake on Windows doesn't support the HIP language yet
|
||||||
|
if (WIN32)
|
||||||
|
set(CXX_IS_HIPCC TRUE)
|
||||||
|
else()
|
||||||
|
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (CXX_IS_HIPCC)
|
||||||
|
if (LINUX)
|
||||||
|
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||||
|
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
|
||||||
|
" Prefer setting the HIP compiler directly. See README for details.")
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
|
||||||
|
if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
|
||||||
|
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
|
||||||
|
endif()
|
||||||
|
cmake_minimum_required(VERSION 3.21)
|
||||||
|
enable_language(HIP)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
find_package(hip REQUIRED)
|
||||||
|
find_package(hipblas REQUIRED)
|
||||||
|
find_package(rocblas REQUIRED)
|
||||||
|
|
||||||
|
message(STATUS "HIP and hipBLAS found")
|
||||||
|
|
||||||
|
file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
|
||||||
|
list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
|
||||||
|
|
||||||
|
file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
|
||||||
|
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
|
||||||
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
|
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
|
||||||
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
|
|
||||||
|
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||||
|
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
|
||||||
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
|
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||||
|
else()
|
||||||
|
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||||
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
|
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||||
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
|
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
|
||||||
|
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
add_library(ggml-hip
|
||||||
|
${GGML_HEADERS_ROCM}
|
||||||
|
${GGML_SOURCES_ROCM})
|
||||||
|
|
||||||
|
target_link_libraries(ggml-hip PRIVATE ggml-base)
|
||||||
|
target_include_directories(ggml-hip PRIVATE . ..)
|
||||||
|
|
||||||
|
# TODO: do not use CUDA definitions for HIP
|
||||||
|
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
|
||||||
|
|
||||||
|
add_compile_definitions(GGML_USE_HIP)
|
||||||
|
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
|
||||||
|
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
|
||||||
|
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
|
||||||
|
|
||||||
|
if (GGML_HIP_UMA)
|
||||||
|
add_compile_definitions(GGML_HIP_UMA)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_CUDA_FORCE_DMMV)
|
||||||
|
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_CUDA_FORCE_MMQ)
|
||||||
|
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_CUDA_FORCE_CUBLAS)
|
||||||
|
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_CUDA_NO_PEER_COPY)
|
||||||
|
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (CXX_IS_HIPCC)
|
||||||
|
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||||
|
target_link_libraries(ggml-hip PRIVATE hip::device)
|
||||||
|
else()
|
||||||
|
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_STATIC)
|
||||||
|
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)
|
Loading…
Reference in New Issue
Block a user