From d0e3ce51f45bd6a646da1952d7e5d143a087db3e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Feb 2024 14:45:41 +0200 Subject: [PATCH] ci : enable -Werror for CUDA builds (#5579) * cmake : pass -Werror through -Xcompiler ggml-ci * make, cmake : enable CUDA errors on warnings ggml-ci --- CMakeLists.txt | 33 +++++++++++++++++++-------------- Makefile | 5 ++++- ggml-cuda.cu | 50 ++++++++++++++++++++++++++------------------------ 3 files changed, 49 insertions(+), 39 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 40a098d01..168b133f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,14 +145,6 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) include(CheckCXXCompilerFlag) -if (LLAMA_FATAL_WARNINGS) - if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - add_compile_options(-Werror) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - add_compile_options(/WX) - endif() -endif() - # enable libstdc++ assertions for debug builds if (CMAKE_SYSTEM_NAME MATCHES "Linux") add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) @@ -747,15 +739,24 @@ function(get_flags CCID CCVER) set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) endfunction() +if (LLAMA_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) - set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes - -Werror=implicit-int -Werror=implicit-function-declaration) - set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS}) - set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS}) + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) @@ -773,6 +774,10 @@ set(CUDA_CXX_FLAGS "") if (LLAMA_CUBLAS) set(CUDA_FLAGS -use_fast_math) + if (LLAMA_FATAL_WARNINGS) + list(APPEND CUDA_FLAGS -Werror all-warnings) + endif() + if (LLAMA_ALL_WARNINGS AND NOT MSVC) set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") diff --git a/Makefile b/Makefile index 29fd2ca9c..63b4af9ba 100644 --- a/Makefile +++ b/Makefile @@ -217,7 +217,7 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn ifeq ($(LLAMA_FATAL_WARNINGS),1) - MK_CFLAGS += -Werror + MK_CFLAGS += -Werror MK_CXXFLAGS += -Werror endif @@ -385,6 +385,9 @@ ifdef LLAMA_CUBLAS MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib OBJS += ggml-cuda.o MK_NVCCFLAGS += -use_fast_math +ifdef LLAMA_FATAL_WARNINGS + MK_NVCCFLAGS += -Werror all-warnings +endif # LLAMA_FATAL_WARNINGS ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT diff --git a/ggml-cuda.cu b/ggml-cuda.cu index eef213509..e091dbdc1 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -651,18 +651,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { return a; } -static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); - } - return a; -#else - (void) a; - NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL -} +//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { +//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +//#pragma unroll +// for (int mask = 16; mask > 0; mask >>= 1) { +// a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); +// } +// return a; +//#else +// (void) a; +// NO_DEVICE_CODE; +//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +//} static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll @@ -672,18 +672,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) { return x; } -static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); - } - return x; -#else - (void) x; - NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX -} +//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { +//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX +//#pragma unroll +// for (int mask = 16; mask > 0; mask >>= 1) { +// x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); +// } +// return x; +//#else +// (void) x; +// NO_DEVICE_CODE; +//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX +//} static __device__ __forceinline__ float op_repeat(const float a, const float b) { return b; @@ -4641,10 +4641,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); #else + (void) ksigns64; assert(false); return 0.f; #endif #else + (void) ksigns64; assert(false); return 0.f; #endif