#!/bin/bash # # Helper script for deploying llama.cpp server with a single Bash command # # - Works on Linux and macOS # - Supports: CPU, CUDA, Metal # - Can run all GGUF models from HuggingFace # - Can serve requests in parallel # - Always builds latest llama.cpp from GitHub # # Limitations # # - Chat templates are poorly supported (base models recommended) # - Might be unstable! # # Usage: # ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive] # # --port: port number, default is 8888 # --repo: path to a repo containing GGUF model files # --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input # --backend: cpu, cuda, metal, depends on the OS # --gpu-id: gpu id, default is 0 # --n-parallel: number of parallel requests, default is 8 # --n-kv: KV cache size, default is 4096 # --verbose: verbose output # --non-interactive: run without asking a permission to run # # Example: # # bash -c "$(curl -s https://ggml.ai/server-llm.sh)" # set -e # required utils: curl, git, make if ! command -v curl &> /dev/null; then printf "[-] curl not found\n" exit 1 fi if ! command -v git &> /dev/null; then printf "[-] git not found\n" exit 1 fi if ! command -v make &> /dev/null; then printf "[-] make not found\n" exit 1 fi # parse arguments is_interactive=1 port=8888 repo="" wtype="" backend="cpu" # if macOS, use metal backend by default if [[ "$OSTYPE" == "darwin"* ]]; then backend="metal" elif command -v nvcc &> /dev/null; then backend="cuda" fi gpu_id=0 n_parallel=8 n_kv=4096 verbose=0 function print_usage { printf "Usage:\n" printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n" printf " --port: port number, default is 8888\n" printf " --repo: path to a repo containing GGUF model files\n" printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" printf " --backend: cpu, cuda, metal, depends on the OS\n" printf " --gpu-id: gpu id, default is 0\n" printf " --n-parallel: number of parallel requests, default is 8\n" printf " --n-kv: KV cache size, default is 4096\n" printf " --verbose: verbose output\n\n" printf " --non-interactive: run without asking a permission to run\n" printf "Example:\n\n" printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' } while [[ $# -gt 0 ]]; do key="$1" case $key in --non-interactive) is_interactive=0 shift ;; --port) port="$2" shift shift ;; --repo) repo="$2" shift shift ;; --wtype) wtype="$2" shift shift ;; --backend) backend="$2" shift shift ;; --gpu-id) gpu_id="$2" shift shift ;; --n-parallel) n_parallel="$2" shift shift ;; --n-kv) n_kv="$2" shift shift ;; --verbose) verbose=1 shift ;; --help) print_usage exit 0 ;; *) echo "Unknown argument: $key" print_usage exit 1 ;; esac done # available weights types wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K") wfiles=() for wt in "${wtypes[@]}"; do wfiles+=("") done # map wtype input to index if [[ ! -z "$wtype" ]]; then iw=-1 is=0 for wt in "${wtypes[@]}"; do # uppercase uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]') if [[ "$uwt" == "$wtype" ]]; then iw=$is break fi is=$((is+1)) done if [[ $iw -eq -1 ]]; then printf "[-] Invalid weight type: %s\n" "$wtype" exit 1 fi wtype="$iw" fi # sample repos repos=( "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" "https://huggingface.co/TheBloke/Llama-2-13B-GGUF" "https://huggingface.co/TheBloke/Llama-2-70B-GGUF" "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF" "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF" "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF" "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF" "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" ) if [ $is_interactive -eq 1 ]; then printf "\n" printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" printf " Based on the options that follow, the script might download a model file\n" printf " from the internet, which can be a few GBs in size. The script will also\n" printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" printf "\n" printf " Upon success, an HTTP server will be started and it will serve the selected\n" printf " model using llama.cpp for demonstration purposes.\n" printf "\n" printf " Please note:\n" printf "\n" printf " - All new data will be stored in the current folder\n" printf " - The server will be listening on all network interfaces\n" printf " - The server will run with default settings which are not always optimal\n" printf " - Do not judge the quality of a model based on the results from this script\n" printf " - Do not use this script to benchmark llama.cpp\n" printf " - Do not use this script in production\n" printf " - This script is only for demonstration purposes\n" printf "\n" printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" printf "\n" printf " Press Enter to continue ...\n\n" read fi if [[ -z "$repo" ]]; then printf "[+] No repo provided from the command line\n" printf " Please select a number from the list below or enter an URL:\n\n" is=0 for r in "${repos[@]}"; do printf " %2d) %s\n" $is "$r" is=$((is+1)) done # ask for repo until index of sample repo is provided or an URL while [[ -z "$repo" ]]; do printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n" read -p "[+] Select repo: " repo # check if the input is a number if [[ "$repo" =~ ^[0-9]+$ ]]; then if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then repo="${repos[$repo]}" else printf "[-] Invalid repo index: %s\n" "$repo" repo="" fi elif [[ "$repo" =~ ^https?:// ]]; then repo="$repo" else printf "[-] Invalid repo URL: %s\n" "$repo" repo="" fi done fi # remove suffix repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g') printf "[+] Checking for GGUF model files in %s\n" "$repo" # find GGUF files in the source # TODO: better logic model_tree="${repo%/}/tree/main" model_files=$(curl -s "$model_tree" | grep -i "\\.gguf" | sed -E 's/.*(.*)<\/span><\/a>/\1/g') # list all files in the provided git repo printf "[+] Model files:\n\n" for file in $model_files; do # determine iw by grepping the filename with wtypes iw=-1 is=0 for wt in "${wtypes[@]}"; do # uppercase ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]') if [[ "$ufile" =~ "$wt" ]]; then iw=$is break fi is=$((is+1)) done if [[ $iw -eq -1 ]]; then continue fi wfiles[$iw]="$file" have=" " if [[ -f "$file" ]]; then have="*" fi printf " %2d) %s %s\n" $iw "$have" "$file" done wfile="${wfiles[$wtype]}" # ask for weights type until provided and available while [[ -z "$wfile" ]]; do printf "\n" read -p "[+] Select weight type: " wtype wfile="${wfiles[$wtype]}" if [[ -z "$wfile" ]]; then printf "[-] Invalid weight type: %s\n" "$wtype" wtype="" fi done printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile" url="${repo%/}/resolve/main/$wfile" # check file if the model has been downloaded before chk="$wfile.chk" # check if we should download the file # - if $wfile does not exist # - if $wfile exists but $chk does not exist # - if $wfile exists and $chk exists but $wfile is newer than $chk # TODO: better logic using git lfs info do_download=0 if [[ ! -f "$wfile" ]]; then do_download=1 elif [[ ! -f "$chk" ]]; then do_download=1 elif [[ "$wfile" -nt "$chk" ]]; then do_download=1 fi if [[ $do_download -eq 1 ]]; then printf "[+] Downloading weights from %s\n" "$url" # download the weights file curl -o "$wfile" -# -L "$url" # create a check file if successful if [[ $? -eq 0 ]]; then printf "[+] Creating check file %s\n" "$chk" touch "$chk" fi else printf "[+] Using cached weights %s\n" "$wfile" fi # get latest llama.cpp and build printf "[+] Downloading latest llama.cpp\n" llama_cpp_dir="__llama_cpp_port_${port}__" if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then # if the dir exists and there isn't a file "__ggml_script__" in it, abort printf "[-] Directory %s already exists\n" "$llama_cpp_dir" printf "[-] Please remove it and try again\n" exit 1 elif [[ -d "$llama_cpp_dir" ]]; then printf "[+] Directory %s already exists\n" "$llama_cpp_dir" printf "[+] Using cached llama.cpp\n" cd "$llama_cpp_dir" git reset --hard git fetch git checkout origin/master cd .. else printf "[+] Cloning llama.cpp\n" git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir" fi # mark that that the directory is made by this script touch "$llama_cpp_dir/__ggml_script__" if [[ $verbose -eq 1 ]]; then set -x fi # build cd "$llama_cpp_dir" make clean log="--silent" if [[ $verbose -eq 1 ]]; then log="" fi if [[ "$backend" == "cuda" ]]; then printf "[+] Building with CUDA backend\n" LLAMA_CUDA=1 make -j llama-server $log elif [[ "$backend" == "cpu" ]]; then printf "[+] Building with CPU backend\n" make -j llama-server $log elif [[ "$backend" == "metal" ]]; then printf "[+] Building with Metal backend\n" make -j llama-server $log else printf "[-] Unknown backend: %s\n" "$backend" exit 1 fi # run the server printf "[+] Running server\n" args="" if [[ "$backend" == "cuda" ]]; then export CUDA_VISIBLE_DEVICES=$gpu_id args="-ngl 999" elif [[ "$backend" == "cpu" ]]; then args="-ngl 0" elif [[ "$backend" == "metal" ]]; then args="-ngl 999" else printf "[-] Unknown backend: %s\n" "$backend" exit 1 fi if [[ $verbose -eq 1 ]]; then args="$args --verbose" fi ./llama-server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args exit 0