mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 10:54:36 +00:00
CMake: default to -arch=native for CUDA build (#10320)
This commit is contained in:
parent
eda7e1d4f5
commit
467576b6cc
@ -459,14 +459,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
|
|||||||
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
|
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
|
||||||
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
|
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
|
||||||
|
|
||||||
## Other documentations
|
## Other documentation
|
||||||
|
|
||||||
- [main (cli)](./examples/main/README.md)
|
- [main (cli)](./examples/main/README.md)
|
||||||
- [server](./examples/server/README.md)
|
- [server](./examples/server/README.md)
|
||||||
- [jeopardy](./examples/jeopardy/README.md)
|
- [jeopardy](./examples/jeopardy/README.md)
|
||||||
- [GBNF grammars](./grammars/README.md)
|
- [GBNF grammars](./grammars/README.md)
|
||||||
|
|
||||||
**Development documentations**
|
**Development documentation**
|
||||||
|
|
||||||
- [How to build](./docs/build.md)
|
- [How to build](./docs/build.md)
|
||||||
- [Running on Docker](./docs/docker.md)
|
- [Running on Docker](./docs/docker.md)
|
||||||
|
@ -6,15 +6,18 @@ if (CUDAToolkit_FOUND)
|
|||||||
message(STATUS "CUDA Toolkit found")
|
message(STATUS "CUDA Toolkit found")
|
||||||
|
|
||||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||||
# 52 == lowest CUDA 12 standard
|
# native == GPUs available at build time
|
||||||
# 60 == FP16 CUDA intrinsics
|
# 52 == Maxwell, lowest CUDA 12 standard
|
||||||
# 61 == integer CUDA intrinsics
|
# 60 == P100, FP16 CUDA intrinsics
|
||||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||||
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
# 70 == V100, FP16 tensor cores
|
||||||
|
# 75 == Turing, int8 tensor cores
|
||||||
|
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6")
|
||||||
|
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||||
|
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
|
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
|
||||||
else()
|
else()
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
|
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
|
||||||
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||||
|
Loading…
Reference in New Issue
Block a user