mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-14 06:49:54 +00:00
f3f65429c4
* scripts : update sync [no ci] * files : relocate [no ci] * ci : disable kompute build [no ci] * cmake : fixes [no ci] * server : fix mingw build ggml-ci * cmake : minor [no ci] * cmake : link math library [no ci] * cmake : build normal ggml library (not object library) [no ci] * cmake : fix kompute build ggml-ci * make,cmake : fix LLAMA_CUDA + replace GGML_CDEF_PRIVATE ggml-ci * move public backend headers to the public include directory (#8122) * move public backend headers to the public include directory * nix test * spm : fix metal header --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * scripts : fix sync paths [no ci] * scripts : sync ggml-blas.h [no ci] --------- Co-authored-by: slaren <slarengh@gmail.com>
419 lines
11 KiB
Bash
419 lines
11 KiB
Bash
#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]
#
# --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
# --backend: cpu, cuda, metal, depends on the OS
# --gpu-id: gpu id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096
# --verbose: verbose output
# --non-interactive: run without asking a permission to run
#
# Example:
#
# bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#

# Abort the script as soon as any unguarded command fails.
set -e

# required utils: curl, git, make
# Probe each tool in turn; the first one that is not found is reported and
# the script stops with exit status 1.
for required_tool in curl git make; do
    if ! command -v "$required_tool" &> /dev/null; then
        printf "[-] %s not found\n" "$required_tool"
        exit 1
    fi
done

# parse arguments
# Default value of every command-line option. The argument loop that follows
# overwrites these with whatever the user passed.
is_interactive=1   # 1: print the disclaimer and wait for Enter before doing anything
port=8888
repo=""
wtype=""
gpu_id=0
n_parallel=8
n_kv=4096
verbose=0

# Default backend: metal on macOS, cuda when nvcc is on the PATH, cpu otherwise.
backend="cpu"
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

# Print the command-line help text to stdout.
# Takes no arguments and reads no variables.
function print_usage {
    printf "Usage:\n"
    # The flag is spelled with two dashes, matching what the argument parser accepts.
    printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [--non-interactive]\n\n"
    printf " --port: port number, default is 8888\n"
    printf " --repo: path to a repo containing GGUF model files\n"
    printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
    printf " --backend: cpu, cuda, metal, depends on the OS\n"
    printf " --gpu-id: gpu id, default is 0\n"
    printf " --n-parallel: number of parallel requests, default is 8\n"
    printf " --n-kv: KV cache size, default is 4096\n"
    printf " --verbose: verbose output\n"
    # The blank line closes the option list, so it goes after the last option.
    printf " --non-interactive: run without asking a permission to run\n\n"
    printf "Example:\n\n"
    printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}

# Parse the command-line options into the global option variables
# (is_interactive, port, repo, wtype, backend, gpu_id, n_parallel, n_kv, verbose).
#
# Arguments: the script's positional parameters.
# Exits 0 after printing the usage for --help.
# Exits 1 (after an error line and the usage) for an unknown argument or for
# an option that needs a value but is the last word on the command line.
function parse_args {
    local key

    while [[ $# -gt 0 ]]; do
        key="$1"

        case "$key" in
            --non-interactive)
                is_interactive=0
                shift
                ;;
            --verbose)
                verbose=1
                shift
                ;;
            --help)
                print_usage
                exit 0
                ;;
            --port|--repo|--wtype|--backend|--gpu-id|--n-parallel|--n-kv)
                # Without this check the second shift fails when the value is
                # missing and `set -e` ends the script with no message at all.
                if [[ $# -lt 2 ]]; then
                    printf "[-] Missing value for argument: %s\n" "$key"
                    print_usage
                    exit 1
                fi

                case "$key" in
                    --port)       port="$2" ;;
                    --repo)       repo="$2" ;;
                    --wtype)      wtype="$2" ;;
                    --backend)    backend="$2" ;;
                    --gpu-id)     gpu_id="$2" ;;
                    --n-parallel) n_parallel="$2" ;;
                    --n-kv)       n_kv="$2" ;;
                esac
                shift 2
                ;;
            *)
                echo "Unknown argument: $key"
                print_usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

# One (initially empty) slot per weight type, index-aligned with wtypes.
wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

# Print the index in wtypes of the weight type named by $1.
# The comparison is case-insensitive: the user's value is upper-cased before
# it is compared with the (upper-case) table entries, so "q4_0" and "Q4_0"
# both match. Returns 1 and prints nothing when the name is not in the table.
function wtype_to_index {
    local wanted idx

    wanted=$(printf '%s' "$1" | tr '[:lower:]' '[:upper:]')

    for idx in "${!wtypes[@]}"; do
        if [[ "${wtypes[$idx]}" == "$wanted" ]]; then
            printf '%s\n' "$idx"
            return 0
        fi
    done

    return 1
}

# map wtype input to index
if [[ -n "$wtype" ]]; then
    if ! iw=$(wtype_to_index "$wtype"); then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        exit 1
    fi

    # From here on wtype holds the numeric index, not the name.
    wtype="$iw"
fi

# sample repos
# Offered as a numbered menu when no --repo is given; the menu number is the
# array index.
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)

# In interactive mode, explain what the script is about to do and wait for
# the user to press Enter (Ctrl-C aborts). Skipped with --non-interactive.
if [[ "$is_interactive" -eq 1 ]]; then
    printf "\n"
    printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
    printf " Based on the options that follow, the script might download a model file\n"
    printf " from the internet, which can be a few GBs in size. The script will also\n"
    printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
    printf "\n"
    printf " Upon success, an HTTP server will be started and it will serve the selected\n"
    printf " model using llama.cpp for demonstration purposes.\n"
    printf "\n"
    printf " Please note:\n"
    printf "\n"
    printf " - All new data will be stored in the current folder\n"
    printf " - The server will be listening on all network interfaces\n"
    printf " - The server will run with default settings which are not always optimal\n"
    printf " - Do not judge the quality of a model based on the results from this script\n"
    printf " - Do not use this script to benchmark llama.cpp\n"
    printf " - Do not use this script in production\n"
    printf " - This script is only for demonstration purposes\n"
    printf "\n"
    printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
    printf "\n"
    printf " Press Enter to continue ...\n\n"

    # The typed line is discarded; -r only stops backslashes being interpreted.
    read -r
fi

# When no --repo was given, show the sample repos and keep asking until the
# user enters either a valid menu number or an http(s) URL.
if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf " Please select a number from the list below or enter an URL:\n\n"

    for is in "${!repos[@]}"; do
        printf " %2d) %s\n" "$is" "${repos[$is]}"
    done

    # ask for repo until index of sample repo is provided or an URL
    while [[ -z "$repo" ]]; do
        printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"

        # read fails at end of input; stop with a message instead of relying
        # on `set -e` to end the script silently.
        if ! read -r -p "[+] Select repo: " repo; then
            printf "\n[-] No input available, aborting\n"
            exit 1
        fi

        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            # 10# forces base 10 so that input such as "08" is not rejected
            # as an invalid octal number.
            if (( 10#$repo < ${#repos[@]} )); then
                repo="${repos[10#$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ ! "$repo" =~ ^https?:// ]]; then
            printf "[-] Invalid repo URL: %s\n" "$repo"
            repo=""
        fi
    done
fi

# remove suffix
# A trailing "/tree/main" is dropped so the URL always names the repo root.
repo="${repo%/tree/main}"

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files in the source
# TODO: better logic
# The repo's file-listing page is downloaded and scraped: lines containing
# ".gguf</span>" are kept and the text inside the
# <span class="truncate group-hover:underline"> element is extracted.
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')

# With nothing to choose from, the weight-type prompt further down could
# never be answered, so stop here with a clear message.
if [[ -z "$model_files" ]]; then
    printf "[-] No GGUF model files found in %s\n" "$model_tree"
    exit 1
fi

# Print the index in wtypes of the first weight type whose name occurs in the
# file name $1 (the file name is upper-cased once before matching).
# Returns 1 and prints nothing when no weight type name occurs in it.
function file_wtype_index {
    local upper_name idx

    upper_name=$(printf '%s' "$1" | tr '[:lower:]' '[:upper:]')

    for idx in "${!wtypes[@]}"; do
        if [[ "$upper_name" == *"${wtypes[$idx]}"* ]]; then
            printf '%s\n' "$idx"
            return 0
        fi
    done

    return 1
}

# list all files in the provided git repo
printf "[+] Model files:\n\n"

# model_files is deliberately unquoted: it holds one file name per
# whitespace-separated word.
for file in $model_files; do
    # determine iw by matching the filename against wtypes; skip other files
    if ! iw=$(file_wtype_index "$file"); then
        continue
    fi

    # Remember which file provides this weight type.
    wfiles[$iw]="$file"

    # "*" marks a file that already exists in the current directory.
    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf " %2d) %s %s\n" "$iw" "$have" "$file"
done

# Print the model file recorded in wfiles for the weight-type index $1.
# Prints nothing unless $1 is a plain non-negative integer inside the array.
# The check matters because Bash evaluates an indexed-array subscript as an
# arithmetic expression: an empty or non-numeric value would not be rejected
# reliably and may silently resolve to index 0 or abort the script.
function wfile_for_index {
    local idx="$1"

    if [[ "$idx" =~ ^[0-9]+$ ]] && (( 10#$idx < ${#wfiles[@]} )); then
        printf '%s\n' "${wfiles[10#$idx]}"
    fi
}

# Start from the weight type given on the command line, if any.
wfile=$(wfile_for_index "$wtype")

# ask for weights type until provided and available
while [[ -z "$wfile" ]]; do
    printf "\n"

    # read fails at end of input; stop with a message instead of relying on
    # `set -e` to end the script silently.
    if ! read -r -p "[+] Select weight type: " wtype; then
        printf "\n[-] No input available, aborting\n"
        exit 1
    fi

    wfile=$(wfile_for_index "$wtype")

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"

url="${repo%/}/resolve/main/$wfile"

# check file if the model has been downloaded before
# An empty marker file next to the weights records that a download completed.
chk="$wfile.chk"

# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0
if [[ ! -f "$wfile" || ! -f "$chk" || "$wfile" -nt "$chk" ]]; then
    do_download=1
fi

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file
    # --fail makes curl return an error for HTTP error responses instead of
    # saving the error page as the model file. The status is tested directly
    # because under `set -e` a separate `$?` check would never see a failure.
    if curl --fail -o "$wfile" -# -L "$url"; then
        # create a check file if successful
        printf "[+] Creating check file %s\n" "$chk"
        touch "$chk"
    else
        printf "[-] Failed to download %s\n" "$url"
        exit 1
    fi
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi

# get latest llama.cpp and build

printf "[+] Downloading latest llama.cpp\n"

# The checkout directory is named after the port.
llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" ]]; then
    # if the dir exists and there isn't a file "__ggml_script__" in it, abort:
    # it was not created by this script and must not be reset.
    if [[ ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
        printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
        printf "[-] Please remove it and try again\n"
        exit 1
    fi

    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    # Discard local changes and move to the latest origin/master.
    # `git -C` runs each command inside the checkout without changing the
    # script's own working directory.
    git -C "$llama_cpp_dir" reset --hard
    git -C "$llama_cpp_dir" fetch
    git -C "$llama_cpp_dir" checkout origin/master
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory is made by this script
touch "$llama_cpp_dir/__ggml_script__"

# Reject an unsupported backend before touching the build tree, rather than
# after `make clean` has already run.
case "$backend" in
    cuda|cpu|metal)
        ;;
    *)
        printf "[-] Unknown backend: %s\n" "$backend"
        exit 1
        ;;
esac

# In verbose mode trace every command from here on.
if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build
cd "$llama_cpp_dir"

make clean

# make runs with --silent unless --verbose was requested. An array keeps the
# "no extra flag" case from turning into an empty argument.
make_flags=(--silent)
if [[ $verbose -eq 1 ]]; then
    make_flags=()
fi

case "$backend" in
    cuda)
        printf "[+] Building with CUDA backend\n"
        GGML_CUDA=1 make -j llama-server "${make_flags[@]}"
        ;;
    cpu)
        printf "[+] Building with CPU backend\n"
        make -j llama-server "${make_flags[@]}"
        ;;
    metal)
        printf "[+] Building with Metal backend\n"
        make -j llama-server "${make_flags[@]}"
        ;;
esac

# run the server

printf "[+] Running server\n"

# The command line is assembled as an array so every value stays a single
# argument. The model path is relative to the llama.cpp checkout, which is
# the current directory at this point.
server_args=(-m "../$wfile" --host 0.0.0.0 --port "$port" -c "$n_kv" -np "$n_parallel")

case "$backend" in
    cuda)
        # Expose only the selected GPU to the server process.
        export CUDA_VISIBLE_DEVICES=$gpu_id
        server_args+=(-ngl 999)
        ;;
    cpu)
        server_args+=(-ngl 0)
        ;;
    metal)
        server_args+=(-ngl 999)
        ;;
    *)
        printf "[-] Unknown backend: %s\n" "$backend"
        exit 1
        ;;
esac

if [[ $verbose -eq 1 ]]; then
    server_args+=(--verbose)
fi

./llama-server "${server_args[@]}"

exit 0