From bbe7c56c9993af86aa2d84cbe1fd69e1b4300cea Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Fri, 26 Jan 2024 15:34:06 -0500 Subject: [PATCH] cmake : pass CPU architecture flags to nvcc (#5146) --- CMakeLists.txt | 74 ++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af3665129..2b2ae532e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -466,17 +466,17 @@ function(get_flags CCID CCVER) (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) ) - set(C_FLAGS ${C_FLAGS} -Wdouble-promotion) + list(APPEND C_FLAGS -Wdouble-promotion) endif() elseif (CCID STREQUAL "GNU") set(C_FLAGS -Wdouble-promotion) set(CXX_FLAGS -Wno-array-bounds) if (CCVER VERSION_GREATER_EQUAL 7.1.0) - set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation) + list(APPEND CXX_FLAGS -Wno-format-truncation) endif() if (CCVER VERSION_GREATER_EQUAL 8.1.0) - set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi) + list(APPEND CXX_FLAGS -Wextra-semi) endif() elseif (CCID MATCHES "Intel") # enable max optimization level when using Intel compiler @@ -510,16 +510,18 @@ if (LLAMA_ALL_WARNINGS) endif() endif() +set(CUDA_CXX_FLAGS "") + if (LLAMA_CUBLAS) set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math) if (NOT MSVC) - set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic) + list(APPEND CUDA_FLAGS -Wno-pedantic) endif() if (LLAMA_ALL_WARNINGS AND NOT MSVC) set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") - set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER}) + list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) endif() execute_process( @@ -547,13 +549,8 @@ if (LLAMA_CUBLAS) message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") get_flags(${CUDA_CCID} ${CUDA_CCVER}) - list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument - if (NOT CUDA_CXX_FLAGS STREQUAL "") - 
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS}) - endif() + list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later endif() - - add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>") endif() if (WIN32) @@ -618,12 +615,7 @@ if (NOT MSVC) endif() endif() -function(add_compile_option_cpp ARG) - # Adds a compile option to C/C++ only, but not for Cuda. - # Use, e.g., for CPU-architecture flags. - add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>) - add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>) -endfunction() +set(ARCH_FLAGS "") if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) message(STATUS "ARM detected") @@ -636,19 +628,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC else() check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - add_compile_options(-mfp16-format=ieee) + list(APPEND ARCH_FLAGS -mfp16-format=ieee) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") # Raspberry Pi 1, Zero - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access) + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") # Raspberry Pi 2 - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") # Raspberry Pi 3, 4, Zero 2 (32-bit) - add_compile_options(-mno-unaligned-access) + list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) @@ -659,7 +651,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE include(cmake/FindSIMD.cmake) endif () if 
(LLAMA_AVX512) - add_compile_option_cpp(/arch:AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions. @@ -673,49 +665,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) endif() elseif (LLAMA_AVX2) - add_compile_option_cpp(/arch:AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) elseif (LLAMA_AVX) - add_compile_option_cpp(/arch:AVX) + list(APPEND ARCH_FLAGS /arch:AVX) endif() else() if (LLAMA_NATIVE) - add_compile_option_cpp(-march=native) + list(APPEND ARCH_FLAGS -march=native) endif() if (LLAMA_F16C) - add_compile_option_cpp(-mf16c) + list(APPEND ARCH_FLAGS -mf16c) endif() if (LLAMA_FMA) - add_compile_option_cpp(-mfma) + list(APPEND ARCH_FLAGS -mfma) endif() if (LLAMA_AVX) - add_compile_option_cpp(-mavx) + list(APPEND ARCH_FLAGS -mavx) endif() if (LLAMA_AVX2) - add_compile_option_cpp(-mavx2) + list(APPEND ARCH_FLAGS -mavx2) endif() if (LLAMA_AVX512) - add_compile_option_cpp(-mavx512f) - add_compile_option_cpp(-mavx512bw) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512bw) endif() if (LLAMA_AVX512_VBMI) - add_compile_option_cpp(-mavx512vbmi) + list(APPEND ARCH_FLAGS -mavx512vbmi) endif() if (LLAMA_AVX512_VNNI) - add_compile_option_cpp(-mavx512vnni) + list(APPEND ARCH_FLAGS -mavx512vnni) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") message(STATUS "PowerPC detected") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - add_compile_options(-mcpu=powerpc64le) + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) else() - add_compile_options(-mcpu=native -mtune=native) + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) endif() else() message(STATUS "Unknown architecture") endif() +add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>") 
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>") + +if (LLAMA_CUBLAS) + list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>") +endif() + if (MINGW) # Target Windows 8 for PrefetchVirtualMemory add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})