diff --git a/CMakeLists.txt b/CMakeLists.txt index 18297834e..7a7197282 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,6 +80,7 @@ set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) set(GGML_LLAMAFILE ON) +set(GGML_CUDA_USE_GRAPHS ON) # transition helpers function (llama_option_depr TYPE OLD NEW) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index f3763f7eb..0d0d52d57 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -109,6 +109,7 @@ option(GGML_LLAMAFILE "ggml: use ggml SGEMM" option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) +option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) @@ -119,6 +120,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) +option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ba341d374..d0f4097d8 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -295,12 +295,15 @@ if (GGML_CUDA) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) - add_compile_definitions(GGML_CUDA_USE_GRAPHS) add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + if (GGML_CUDA_USE_GRAPHS) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + endif() + if (GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif()