From f3947e1e0262b68b6032b00b670db2d34fb77959 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 31 Oct 2023 13:58:18 +0200
Subject: [PATCH] scripts : rename to server-llm.sh

---
 scripts/{deploy-server.sh => server-llm.sh} | 62 +++++++++++++++++++--
 1 file changed, 56 insertions(+), 6 deletions(-)
 rename scripts/{deploy-server.sh => server-llm.sh} (77%)

diff --git a/scripts/deploy-server.sh b/scripts/server-llm.sh
similarity index 77%
rename from scripts/deploy-server.sh
rename to scripts/server-llm.sh
index 66b9def06..3a6c4c840 100644
--- a/scripts/deploy-server.sh
+++ b/scripts/server-llm.sh
@@ -5,11 +5,16 @@
 # - Works on Linux and macOS
 # - Supports: CPU, CUDA, Metal, OpenCL
 # - Can run all GGUF models from HuggingFace
-# - Always build latest llama.cpp from GitHub
+# - Can serve requests in parallel
+# - Always builds latest llama.cpp from GitHub
+#
+# Limitations
+#
+# - Chat templates are poorly supported (base models recommended)
 # - Might be unstable!
 #
 # Usage:
-#   ./deploy-server.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
+#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
 #
 #   --port: port number, default is 8888
 #   --repo: path to a repo containing GGUF model files
@@ -22,7 +27,7 @@
 #
 # Example:
 #
-#   curl https://ggml.ai/deploy-server.sh | bash -s --
+#   bash <(curl https://ggml.ai/server-llm.sh)
 #
 
 set -e
@@ -59,6 +64,21 @@ n_parallel=8
 n_kv=4096
 verbose=0
 
+function print_usage {
+    printf "Usage:\n"
+    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
+    printf "  --port: port number, default is 8888\n"
+    printf "  --repo: path to a repo containing GGUF model files\n"
+    printf "  --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
+    printf "  --backend: cpu, cuda, metal, opencl, depends on the OS\n"
+    printf "  --gpu-id: gpu id, default is 0\n"
+    printf "  --n-parallel: number of parallel requests, default is 8\n"
+    printf "  --n-kv: KV cache size, default is 4096\n"
+    printf "  --verbose: verbose output\n\n"
+    printf "Example:\n\n"
+    printf "  bash <(curl https://ggml.ai/server-llm.sh)\n\n"
+}
+
 while [[ $# -gt 0 ]]; do
     key="$1"
     case $key in
@@ -101,8 +121,13 @@
             verbose=1
             shift
             ;;
+        --help)
+            print_usage
+            exit 0
+            ;;
         *)
             echo "Unknown argument: $key"
+            print_usage
             exit 1
             ;;
     esac
@@ -121,6 +146,9 @@ repos=(
     "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
     "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
     "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
     "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
     "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
     "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
@@ -131,15 +159,30 @@
 printf "\n"
 printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
 printf "    Based on the options that follow, the script might download a model file\n"
 printf "    from the internet, which can be a few GBs in size. The script will also\n"
-printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n\n"
-printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n\n"
+printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
+printf "\n"
+printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
+printf "    model using llama.cpp for demonstration purposes.\n"
+printf "\n"
+printf "    Please note:\n"
+printf "\n"
+printf "    - All new data will be stored in the current folder\n"
+printf "    - The server will be listening on all network interfaces\n"
+printf "    - The server will run with default settings which are not always optimal\n"
+printf "    - Do not judge the quality of a model based on the results from this script\n"
+printf "    - Do not use this script to benchmark llama.cpp\n"
+printf "    - Do not use this script in production\n"
+printf "    - This script is only for demonstration purposes\n"
+printf "\n"
+printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
+printf "\n"
 printf "    Press Enter to continue ...\n\n"
 read
 
 if [[ -z "$repo" ]]; then
     printf "[+] No repo provided from the command line\n"
-    printf "    Please select a number from the sample repos below or enter an URL:\n\n"
+    printf "    Please select a number from the list below or enter an URL:\n\n"
 
     is=0
     for r in "${repos[@]}"; do
@@ -174,6 +217,8 @@
 repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
 
 printf "[+] Checking for GGUF model files in %s\n" "$repo"
 
+# find GGUF files in the source
+# TODO: better logic
 model_tree="${repo%/}/tree/main"
 model_files=$(curl -s "$model_tree" | grep -i "\\.gguf" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
@@ -230,7 +275,10 @@ chk="$wfile.chk"
 # - if $wfile does not exist
 # - if $wfile exists but $chk does not exist
 # - if $wfile exists and $chk exists but $wfile is newer than $chk
+# TODO: better logic using git lfs info
+
 do_download=0
+
 if [[ ! -f "$wfile" ]]; then
     do_download=1
 elif [[ ! -f "$chk" ]]; then
@@ -276,6 +324,8 @@ elif [[ -d "$llama_cpp_dir" ]]; then
     cd ..
 else
+    printf "[+] Cloning llama.cpp\n"
+
     git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
 fi
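
For reference, a usage sketch for the renamed script, built only from the flags documented in the patch above. The chosen values and the model repo are illustrative (the repo is one entry from the script's own list), not recommendations:

    # serve a quantized model locally for a quick test (illustrative values)
    ./server-llm.sh --port 8888 --backend cpu \
        --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF \
        --wtype q4_0 --n-parallel 4 --n-kv 4096 --verbose

    # or fetch and run it in one step, as in the script's own example
    bash <(curl https://ggml.ai/server-llm.sh)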