mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 11:24:35 +00:00
scripts : rename to server-llm.sh
This commit is contained in:
parent
2f719c876d
commit
f3947e1e02
@ -5,11 +5,16 @@
|
|||||||
# - Works on Linux and macOS
|
# - Works on Linux and macOS
|
||||||
# - Supports: CPU, CUDA, Metal, OpenCL
|
# - Supports: CPU, CUDA, Metal, OpenCL
|
||||||
# - Can run all GGUF models from HuggingFace
|
# - Can run all GGUF models from HuggingFace
|
||||||
# - Always build latest llama.cpp from GitHub
|
# - Can serve requests in parallel
|
||||||
|
# - Always builds latest llama.cpp from GitHub
|
||||||
|
#
|
||||||
|
# Limitations
|
||||||
|
#
|
||||||
|
# - Chat templates are poorly supported (base models recommended)
|
||||||
# - Might be unstable!
|
# - Might be unstable!
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# ./deploy-server.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
|
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
|
||||||
#
|
#
|
||||||
# --port: port number, default is 8888
|
# --port: port number, default is 8888
|
||||||
# --repo: path to a repo containing GGUF model files
|
# --repo: path to a repo containing GGUF model files
|
||||||
@ -22,7 +27,7 @@
|
|||||||
#
|
#
|
||||||
# Example:
|
# Example:
|
||||||
#
|
#
|
||||||
# curl https://ggml.ai/deploy-server.sh | bash -s --
|
# bash <(curl https://ggml.ai/server-llm.sh)
|
||||||
#
|
#
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
@ -59,6 +64,21 @@ n_parallel=8
|
|||||||
n_kv=4096
|
n_kv=4096
|
||||||
verbose=0
|
verbose=0
|
||||||
|
|
||||||
|
function print_usage {
|
||||||
|
printf "Usage:\n"
|
||||||
|
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
|
||||||
|
printf " --port: port number, default is 8888\n"
|
||||||
|
printf " --repo: path to a repo containing GGUF model files\n"
|
||||||
|
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
|
||||||
|
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
|
||||||
|
printf " --gpu-id: gpu id, default is 0\n"
|
||||||
|
printf " --n-parallel: number of parallel requests, default is 8\n"
|
||||||
|
printf " --n-kv: KV cache size, default is 4096\n"
|
||||||
|
printf " --verbose: verbose output\n\n"
|
||||||
|
printf "Example:\n\n"
|
||||||
|
printf " bash <(curl https://ggml.ai/server-llm.sh)\n\n"
|
||||||
|
}
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
key="$1"
|
key="$1"
|
||||||
case $key in
|
case $key in
|
||||||
@ -101,8 +121,13 @@ while [[ $# -gt 0 ]]; do
|
|||||||
verbose=1
|
verbose=1
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--help)
|
||||||
|
print_usage
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unknown argument: $key"
|
echo "Unknown argument: $key"
|
||||||
|
print_usage
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
@ -121,6 +146,9 @@ repos=(
|
|||||||
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
|
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
|
||||||
"https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
|
"https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
|
||||||
"https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
|
"https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
|
||||||
|
"https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
|
||||||
|
"https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
|
||||||
|
"https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
|
||||||
"https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
|
"https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
|
||||||
"https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
|
"https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
|
||||||
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
|
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
|
||||||
@ -131,15 +159,30 @@ printf "\n"
|
|||||||
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
|
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
|
||||||
printf " Based on the options that follow, the script might download a model file\n"
|
printf " Based on the options that follow, the script might download a model file\n"
|
||||||
printf " from the internet, which can be a few GBs in size. The script will also\n"
|
printf " from the internet, which can be a few GBs in size. The script will also\n"
|
||||||
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n\n"
|
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
|
||||||
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n\n"
|
printf "\n"
|
||||||
|
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
|
||||||
|
printf " model using llama.cpp for demonstration purposes.\n"
|
||||||
|
printf "\n"
|
||||||
|
printf " Please note:\n"
|
||||||
|
printf "\n"
|
||||||
|
printf " - All new data will be stored in the current folder\n"
|
||||||
|
printf " - The server will be listening on all network interfaces\n"
|
||||||
|
printf " - The server will run with default settings which are not always optimal\n"
|
||||||
|
printf " - Do not judge the quality of a model based on the results from this script\n"
|
||||||
|
printf " - Do not use this script to benchmark llama.cpp\n"
|
||||||
|
printf " - Do not use this script in production\n"
|
||||||
|
printf " - This script is only for demonstration purposes\n"
|
||||||
|
printf "\n"
|
||||||
|
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
|
||||||
|
printf "\n"
|
||||||
printf " Press Enter to continue ...\n\n"
|
printf " Press Enter to continue ...\n\n"
|
||||||
|
|
||||||
read
|
read
|
||||||
|
|
||||||
if [[ -z "$repo" ]]; then
|
if [[ -z "$repo" ]]; then
|
||||||
printf "[+] No repo provided from the command line\n"
|
printf "[+] No repo provided from the command line\n"
|
||||||
printf " Please select a number from the sample repos below or enter an URL:\n\n"
|
printf " Please select a number from the list below or enter an URL:\n\n"
|
||||||
|
|
||||||
is=0
|
is=0
|
||||||
for r in "${repos[@]}"; do
|
for r in "${repos[@]}"; do
|
||||||
@ -174,6 +217,8 @@ repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
|
|||||||
|
|
||||||
printf "[+] Checking for GGUF model files in %s\n" "$repo"
|
printf "[+] Checking for GGUF model files in %s\n" "$repo"
|
||||||
|
|
||||||
|
# find GGUF files in the source
|
||||||
|
# TODO: better logic
|
||||||
model_tree="${repo%/}/tree/main"
|
model_tree="${repo%/}/tree/main"
|
||||||
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
|
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
|
||||||
|
|
||||||
@ -230,7 +275,10 @@ chk="$wfile.chk"
|
|||||||
# - if $wfile does not exist
|
# - if $wfile does not exist
|
||||||
# - if $wfile exists but $chk does not exist
|
# - if $wfile exists but $chk does not exist
|
||||||
# - if $wfile exists and $chk exists but $wfile is newer than $chk
|
# - if $wfile exists and $chk exists but $wfile is newer than $chk
|
||||||
|
# TODO: better logic using git lfs info
|
||||||
|
|
||||||
do_download=0
|
do_download=0
|
||||||
|
|
||||||
if [[ ! -f "$wfile" ]]; then
|
if [[ ! -f "$wfile" ]]; then
|
||||||
do_download=1
|
do_download=1
|
||||||
elif [[ ! -f "$chk" ]]; then
|
elif [[ ! -f "$chk" ]]; then
|
||||||
@ -276,6 +324,8 @@ elif [[ -d "$llama_cpp_dir" ]]; then
|
|||||||
|
|
||||||
cd ..
|
cd ..
|
||||||
else
|
else
|
||||||
|
printf "[+] Cloning llama.cpp\n"
|
||||||
|
|
||||||
git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
|
git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
|
||||||
fi
|
fi
|
||||||
|
|
Loading…
Reference in New Issue
Block a user