From 467576b6cc7d2b9220f55bc635aa51469cf26fb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sun, 17 Nov 2024 09:06:34 +0100
Subject: [PATCH] CMake: default to -arch=native for CUDA build (#10320)

---
 README.md                         |  4 ++--
 ggml/src/ggml-cuda/CMakeLists.txt | 15 +++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6ab6acf12..5f7933c13 100644
--- a/README.md
+++ b/README.md
@@ -459,14 +459,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
 
-## Other documentations
+## Other documentation
 
 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)
 
-**Development documentations**
+**Development documentation**
 
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 40ed2bdf3..860552f3a 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -6,15 +6,18 @@ if (CUDAToolkit_FOUND)
     message(STATUS "CUDA Toolkit found")
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
-        # 60 == FP16 CUDA intrinsics
-        # 61 == integer CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        # native == GPUs available at build time
+        # 52     == Maxwell, lowest CUDA 12 standard
+        # 60     == P100, FP16 CUDA intrinsics
+        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
+        # 70     == V100, FP16 tensor cores
+        # 75     == Turing, int8 tensor cores
+        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6")
+            set(CMAKE_CUDA_ARCHITECTURES "native")
+        elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
         else()
             set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
-            #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")