#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run any GGUF model from Hugging Face
# - Always builds the latest llama.cpp from GitHub
# - Might be unstable!
#
# Usage:
# ./deploy-server.sh [--port PORT] [--repo REPO] [--wtype TYPE] [--backend BACKEND] [--gpu-id ID] [--n-parallel N] [--n-kv N] [--verbose]
#
# --port:       port number, default is 8888
# --repo:       URL of a Hugging Face repo containing GGUF model files
# --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is interactive selection
# --backend:    cpu, cuda, metal, or opencl; the default depends on the OS
# --gpu-id:     GPU id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv:       KV cache size, default is 4096
# --verbose:    verbose output
#
# Example:
#
# curl https://ggml.ai/deploy-server.sh | bash -s --
#
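# Example with explicit options (illustrative values):
#
#   ./deploy-server.sh --port 8080 --backend cuda --wtype q4_0 \
#       --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF
#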
set -e
# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
    printf "[-] curl not found\n"
    exit 1
fi

if ! command -v git &> /dev/null; then
    printf "[-] git not found\n"
    exit 1
fi

if ! command -v make &> /dev/null; then
    printf "[-] make not found\n"
    exit 1
fi
# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"
# default backend: Metal on macOS, CUDA if nvcc is available, otherwise CPU
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi
gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --port)
            port="$2"
            shift 2
            ;;
        --repo)
            repo="$2"
            shift 2
            ;;
        --wtype)
            wtype="$2"
            shift 2
            ;;
        --backend)
            backend="$2"
            shift 2
            ;;
        --gpu-id)
            gpu_id="$2"
            shift 2
            ;;
        --n-parallel)
            n_parallel="$2"
            shift 2
            ;;
        --n-kv)
            n_kv="$2"
            shift 2
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        *)
            echo "Unknown argument: $key"
            exit 1
            ;;
    esac
done
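# note: the parser above assumes every value-taking flag is followed by its
# value; a stricter variant (sketch, not used here) would verify this before
# shifting, e.g. at the top of each case branch:
#
#   [[ $# -lt 2 ]] && { printf "[-] Missing value for %s\n" "$1"; exit 1; }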
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done
# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n\n"
printf " Press Enter to continue ...\n\n"
read
if [[ -z "$repo" ]]; then
printf "[+] No repo provided from the command line\n"
printf " Please select a number from the sample repos below or enter an URL:\n\n"
is=0
for r in "${repos[@]}"; do
printf " %2d) %s\n" $is "$r"
is=$((is+1))
done
# ask for repo until index of sample repo is provided or an URL
while [[ -z "$repo" ]]; do
printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
read -p "[+] Select repo: " repo
# check if the input is a number
if [[ "$repo" =~ ^[0-9]+$ ]]; then
if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
repo="${repos[$repo]}"
else
printf "[-] Invalid repo index: %s\n" "$repo"
repo=""
fi
elif [[ "$repo" =~ ^https?:// ]]; then
repo="$repo"
else
printf "[-] Invalid repo URL: %s\n" "$repo"
repo=""
fi
done
fi
# strip a trailing /tree/main suffix if present
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
printf "[+] Checking for GGUF model files in %s\n" "$repo"
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
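# note: the pipeline above scrapes the repo's HTML file listing, which can break
# if Hugging Face changes its markup; an alternative sketch (untested, assumes
# jq is installed) would query the Hugging Face model API instead:
#
#   api="https://huggingface.co/api/models/${repo#https://huggingface.co/}"
#   model_files=$(curl -s "$api" | jq -r '.siblings[].rfilename' | grep -i '\.gguf$')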
# list the GGUF model files found in the repo
printf "[+] Model files:\n\n"
for file in $model_files; do
    # uppercase the filename for case-insensitive matching
    ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')

    # determine the weight-type index (iw) by matching the filename against wtypes
    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    # mark files that are already present locally with a "*"
    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done
# map a weight-type name given via --wtype (e.g. "q4_0") to its index
if [[ -n "$wtype" && ! "$wtype" =~ ^[0-9]+$ ]]; then
    uwtype=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')
    wtype=""
    for is in "${!wtypes[@]}"; do
        [[ "${wtypes[$is]}" == "$uwtype" ]] && wtype=$is && break
    done
fi

# validate a command-line selection, then ask until a valid one is provided
wfile=""
[[ -n "$wtype" ]] && wfile="${wfiles[$wtype]}"
while [[ -z "$wfile" ]]; do
    printf "\n"
    read -p "[+] Select weight type: " wtype
    wfile="${wfiles[$wtype]}"
    [[ -z "$wfile" ]] && printf "[-] Invalid weight type: %s\n" "$wtype"
done
printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"
# check file used to detect whether the model has been downloaded before
chk="$wfile.chk"

# download the file if:
# - $wfile does not exist, or
# - $wfile exists but $chk does not, or
# - both exist but $wfile is newer than $chk
do_download=0

if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi
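# note: the three tests above are equivalent to a single compound conditional,
# kept separate for readability:
#
#   if [[ ! -f "$wfile" || ! -f "$chk" || "$wfile" -nt "$chk" ]]; then do_download=1; fi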
if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file
    curl -o "$wfile" -# -L "$url"

    # create a check file if successful (note: with "set -e" a failed curl
    # aborts the script before this point is reached)
    if [[ $? -eq 0 ]]; then
        printf "[+] Creating check file %s\n" "$chk"
        touch "$chk"
    fi
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi
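# note: for multi-GB weights an interrupted transfer can be resumed by adding
# curl's "-C -" option to the download above, e.g. (sketch):
#
#   curl -C - -o "$wfile" -# -L "$url"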
# get latest llama.cpp and build
printf "[+] Downloading latest llama.cpp\n"
llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi
# mark that the directory was created by this script
touch "$llama_cpp_dir/__ggml_script__"
if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build
cd "$llama_cpp_dir"
make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j server $log
elif [[ "$backend" == "metal" ]]; then
printf "[+] Building with Metal backend\n"
make -j server $log
elif [[ "$backend" == "opencl" ]]; then
printf "[+] Building with OpenCL backend\n"
LLAMA_CLBLAST=1 make -j server $log
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
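# note: LLAMA_CUBLAS and LLAMA_CLBLAST are the llama.cpp Makefile flags at the
# time of writing; since the script always builds the latest master, the flag
# names may drift as upstream evolves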
# run the server
printf "[+] Running server\n"
args=""
if [[ "$backend" == "cuda" ]]; then
export CUDA_VISIBLE_DEVICES=$gpu_id
args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
args="-ngl 999"
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
if [[ $verbose -eq 1 ]]; then
args="$args --verbose"
fi
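# once the server is up, it can be smoke-tested from another shell, e.g.
# (adjust the port if a non-default --port was used):
#
#   curl http://127.0.0.1:8888/completion -d '{"prompt": "Hello", "n_predict": 16}'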
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
exit 0