From 794db3e7b982fee37e3995db9c3a216a57ff65e3 Mon Sep 17 00:00:00 2001 From: Randall Fitzgerald Date: Sat, 17 Jun 2023 07:53:04 -0400 Subject: [PATCH] Server Example Refactor and Improvements (#1570) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A major rewrite for the server example. Note that if you have built something on the previous server API, it will probably be incompatible. Check out the examples for how a typical chat app could work. This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing. Summary of the changes: - adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos - applies missing top k sampler - removes interactive mode/terminal-like behavior, removes exclude parameter - moves threads and batch size to server command-line parameters - adds LoRA loading and matches command line parameters with main example - fixes stopping on EOS token and with the specified token amount with n_predict - adds server timeouts, host, and port settings - adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text - sets defaults for unspecified parameters between requests - removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming - adds CORS headers to responses - adds request logging, exception printing and optional verbose logging - adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string - adds printing an error when it can't bind to the host/port specified - fixes multi-byte character handling and replaces invalid UTF-8 characters on responses - prints timing and build info on startup - adds logit bias to request parameters - removes embedding 
mode - updates documentation; adds streaming Node.js and Bash examples - fixes code formatting - sets server threads to 1 since the current global state doesn't work well with simultaneous requests - adds truncation of the input prompt and better context reset - removes token limit from the input prompt - significantly simplified the logic and removed a lot of variables --------- Co-authored-by: anon998 <131767832+anon998@users.noreply.github.com> Co-authored-by: Henri Vasserman Co-authored-by: Felix Hellmann Co-authored-by: Johannes Gäßler Co-authored-by: Lesaun Harvey --- .gitignore | 1 + Makefile | 2 + examples/server/CMakeLists.txt | 4 + examples/server/README.md | 318 ++---- examples/server/chat.mjs | 89 ++ examples/server/chat.sh | 77 ++ examples/server/server.cpp | 1681 +++++++++++++++++--------------- 7 files changed, 1169 insertions(+), 1003 deletions(-) create mode 100644 examples/server/chat.mjs create mode 100644 examples/server/chat.sh diff --git a/.gitignore b/.gitignore index b3ff6526c..e68fd724a 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ models/* /train-text-from-scratch /benchmark-matmult /vdot +/server /Pipfile /libllama.so diff --git a/Makefile b/Makefile index 5306a114f..eee9eeb53 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,8 @@ BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-tex ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server + LLAMA_SERVER_VERBOSE ?= 1 +server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif default: $(BUILD_TARGETS) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index bd65c84b1..07ba76ad3 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,10 @@ set(TARGET server) +option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) +target_compile_definitions(${TARGET} PRIVATE + 
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> +) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/server/README.md b/examples/server/README.md index 3b111655a..474a28b20 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1,37 +1,74 @@ # llama.cpp/example/server -This example allow you to have a llama.cpp http server to interact from a web page or consume the API. +This example demonstrates a simple HTTP API server to interact with llama.cpp. -## Table of Contents +Command line options: -1. [Quick Start](#quick-start) -2. [Node JS Test](#node-js-test) -3. [API Endpoints](#api-endpoints) -4. [More examples](#more-examples) -5. [Common Options](#common-options) -6. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options) +- `--threads N`, `-t N`: Set the number of threads to use during computation. +- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. 
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. +- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. +- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. +- `--port`: Set the port to listen. Default: `8080`. + +## Build + +Build llama.cpp with server from repository root with either make or CMake. + +- Using `make`: + + ```bash + LLAMA_BUILD_SERVER=1 make + ``` + +- Using `CMake`: + + ```bash + mkdir build-server + cd build-server + cmake -DLLAMA_BUILD_SERVER=ON .. + cmake --build . 
--config Release + ``` ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: -#### Unix-based systems (Linux, macOS, etc.): -Make sure to build with the server option on -```bash -LLAMA_BUILD_SERVER=1 make -``` +### Unix-based systems (Linux, macOS, etc.): ```bash -./server -m models/7B/ggml-model.bin --ctx_size 2048 +./server -m models/7B/ggml-model.bin -c 2048 ``` -#### Windows: +### Windows: ```powershell -server.exe -m models\7B\ggml-model.bin --ctx_size 2048 +server.exe -m models\7B\ggml-model.bin -c 2048 ``` -That will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. +The above command will start a server that by default listens on `127.0.0.1:8080`. +You can consume the endpoints with Postman or NodeJS with axios library. + +## Testing with CURL + +Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. + +```sh +curl --request POST \ + --url http://localhost:8080/completion \ + --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' +``` ## Node JS Test @@ -54,7 +91,6 @@ const prompt = `Building a website can be done in 10 simple steps:`; async function Test() { let result = await axios.post("http://127.0.0.1:8080/completion", { prompt, - batch_size: 128, n_predict: 512, }); @@ -73,247 +109,75 @@ node . ## API Endpoints -You can interact with this API Endpoints. This implementations just support chat style interaction. +- **POST** `/completion`: Given a prompt, it returns the predicted completion. -- **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks. + *Options:* -*Options:* + `temperature`: Adjust the randomness of the generated text (default: 0.8). -`batch_size`: Set the batch size for prompt processing (default: 512). 
+ `top_k`: Limit the next token selection to the K most probable tokens (default: 40). -`temperature`: Adjust the randomness of the generated text (default: 0.8). + `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). -`top_k`: Limit the next token selection to the K most probable tokens (default: 40). + `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity). -`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). + `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. + By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. -`n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). + `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. -`threads`: Set the number of threads to use during computation. + `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. -`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + `stop`: Specify a JSON array of stopping strings. + These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). -`as_loop`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. 
+ `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). -`interactive`: It allows interacting with the completion, and the completion stops as soon as it encounters a `stop word`. To enable this, set to `true`. + `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). -`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. + `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1). -`stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. + `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). -`exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. + `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true). -- **POST** `hostname:port/embedding`: Generate embedding of a given text + `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled). -*Options:* + `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled); -`content`: Set the text to get generate the embedding. + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). -`threads`: Set the number of threads to use during computation. + `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0). -To use this endpoint, you need to start the server with the `--embedding` option added. + `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1). 
-- **POST** `hostname:port/tokenize`: Tokenize a given text + `seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed). -*Options:* + `ignore_eos`: Ignore end of stream token and continue generating (default: false). -`content`: Set the text to tokenize. + `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []). -- **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request. +- **POST** `/tokenize`: Tokenize a given text. -*Options:* + *Options:* -`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation. + `content`: Set the text to tokenize. ## More examples ### Interactive mode -This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model. +Check the sample in [chat.mjs](chat.mjs). +Run with NodeJS version 16 or later: -The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well. - -This example works well for `Vicuna - version 1`. - -```javascript -const axios = require("axios"); - -let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. -### Human: Hello, Assistant. -### Assistant: Hello. How may I help you today? -### Human: Please tell me the largest city in Europe. -### Assistant: Sure. 
The largest city in Europe is Moscow, the capital of Russia.`; - -async function ChatCompletion(answer) { - // the user's next question to the prompt - prompt += `\n### Human: ${answer}\n` - - result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - batch_size: 128, - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: -1, - n_predict: 2048, - stop: ["\n### Human:"], // when detect this, stop completion - exclude: ["### Assistant:"], // no show in the completion - threads: 8, - as_loop: true, // use this to request the completion token by token - interactive: true, // enable the detection of a stop word - }); - - // create a loop to receive every token predicted - // note: this operation is blocking, avoid use this in a ui thread - - let message = ""; - while (true) { - // you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true - result = await axios.get("http://127.0.0.1:8080/next-token"); - process.stdout.write(result.data.content); - message += result.data.content; - - // to avoid an infinite loop - if (result.data.stop) { - console.log("Completed"); - // make sure to add the completion to the prompt. - prompt += `### Assistant: ${message}`; - break; - } - } -} - -// This function should be called every time a question to the model is needed. -async function Test() { - // the server can't inference in paralell - await ChatCompletion("Write a long story about a time magician in a fantasy world"); - await ChatCompletion("Summary the story"); -} - -Test(); +```sh +node chat.mjs ``` -### Alpaca example +Another sample in [chat.sh](chat.sh). +Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/). +Run with bash: -**Temporaly note:** no tested, if you have the model, please test it and report me some issue - -```javascript -const axios = require("axios"); - -let prompt = `Below is an instruction that describes a task. 
Write a response that appropriately completes the request. -`; - -async function DoInstruction(instruction) { - prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`; - result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - batch_size: 128, - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: -1, - n_predict: 2048, - stop: ["### Instruction:\n\n"], // when detect this, stop completion - exclude: [], // no show in the completion - threads: 8, - as_loop: true, // use this to request the completion token by token - interactive: true, // enable the detection of a stop word - }); - - // create a loop to receive every token predicted - // note: this operation is blocking, avoid use this in a ui thread - - let message = ""; - while (true) { - result = await axios.get("http://127.0.0.1:8080/next-token"); - process.stdout.write(result.data.content); - message += result.data.content; - - // to avoid an infinite loop - if (result.data.stop) { - console.log("Completed"); - // make sure to add the completion and the user's next question to the prompt. - prompt += message; - break; - } - } -} - -// This function should be called every time a instruction to the model is needed. 
-DoInstruction("Destroy the world"); // as joke +```sh +bash chat.sh ``` - -### Embeddings - -First, run the server with `--embedding` option: - -```bash -server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding -``` - -Run this code in NodeJS: - -```javascript -const axios = require('axios'); - -async function Test() { - let result = await axios.post("http://127.0.0.1:8080/embedding", { - content: `Hello`, - threads: 5 - }); - // print the embedding array - console.log(result.data.embedding); -} - -Test(); -``` - -### Tokenize - -Run this code in NodeJS: - -```javascript -const axios = require('axios'); - -async function Test() { - let result = await axios.post("http://127.0.0.1:8080/tokenize", { - content: `Hello` - }); - // print the embedding array - console.log(result.data.tokens); -} - -Test(); -``` - -## Common Options - -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. -- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. 
For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. -- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. -- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**. -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`; -- `--port`: Set the port to listen. Default: `8080`. - -### RNG Seed - -- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed). - -The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run. - -## Performance Tuning and Memory Options - -### No Memory Mapping - -- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. - -### Memory Float 32 - -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended. 
- -## Limitations: - -- The actual implementation of llama.cpp need a `llama-state` for handle multiple contexts and clients, but this could require more powerful hardware. diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs new file mode 100644 index 000000000..8269e2592 --- /dev/null +++ b/examples/server/chat.mjs @@ -0,0 +1,89 @@ +import * as readline from 'node:readline' +import { stdin, stdout } from 'node:process' + +const API_URL = 'http://127.0.0.1:8080' + +const chat = [ + { + human: "Hello, Assistant.", + assistant: "Hello. How may I help you today?" + }, + { + human: "Please tell me the largest city in Europe.", + assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia." + }, +] + +const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.` + +function format_prompt(question) { + return `${instruction}\n${ + chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n") + }\n### Human: ${question}\n### Assistant:` +} + +async function tokenize(content) { + const result = await fetch(`${API_URL}/tokenize`, { + method: 'POST', + body: JSON.stringify({ content }) + }) + + if (!result.ok) { + return [] + } + + return (await result.json()).tokens +} + +const n_keep = (await tokenize(instruction)).length + +async function chat_completion(question) { + const result = await fetch(`${API_URL}/completion`, { + method: 'POST', + body: JSON.stringify({ + prompt: format_prompt(question), + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: n_keep, + n_predict: 256, + stop: ["\n### Human:"], // stop completion after generating this + stream: true, + }) + }) + + if (!result.ok) { + return + } + + let answer = '' + + for await (var chunk of result.body) { + const t = Buffer.from(chunk).toString('utf8') + if (t.startsWith('data: ')) { + const message = JSON.parse(t.substring(6)) + answer += 
message.content + process.stdout.write(message.content) + if (message.stop) { + if (message.truncated) { + chat.shift() + } + break + } + } + } + + process.stdout.write('\n') + chat.push({ human: question, assistant: answer.trimStart() }) +} + +const rl = readline.createInterface({ input: stdin, output: stdout }); + +const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => { + rl.question(query, options, resolve) +}); + +while(true) { + const question = await readlineQuestion(rl, '> ') + await chat_completion(question) +} diff --git a/examples/server/chat.sh b/examples/server/chat.sh new file mode 100644 index 000000000..a89f8e908 --- /dev/null +++ b/examples/server/chat.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +API_URL="${API_URL:-http://127.0.0.1:8080}" + +CHAT=( + "Hello, Assistant." + "Hello. How may I help you today?" + "Please tell me the largest city in Europe." + "Sure. The largest city in Europe is Moscow, the capital of Russia." +) + +INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." 
+ +trim() { + shopt -s extglob + set -- "${1##+([[:space:]])}" + printf "%s" "${1%%+([[:space:]])}" +} + +trim_trailing() { + shopt -s extglob + printf "%s" "${1%%+([[:space:]])}" +} + +format_prompt() { + echo -n "${INSTRUCTION}" + printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1" +} + +tokenize() { + curl \ + --silent \ + --request POST \ + --url "${API_URL}/tokenize" \ + --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ + | jq '.tokens[]' +} + +N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l) + +chat_completion() { + PROMPT="$(trim_trailing "$(format_prompt "$1")")" + DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{ + prompt: ., + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: $n_keep, + n_predict: 256, + stop: ["\n### Human:"], + stream: true + }')" + + ANSWER='' + + while IFS= read -r LINE; do + if [[ $LINE = data:* ]]; then + CONTENT="$(echo "${LINE:5}" | jq -r '.content')" + printf "%s" "${CONTENT}" + ANSWER+="${CONTENT}" + fi + done < <(curl \ + --silent \ + --no-buffer \ + --request POST \ + --url "${API_URL}/completion" \ + --data-raw "${DATA}") + + printf "\n" + + CHAT+=("$1" "$(trim "$ANSWER")") +} + +while true; do + read -r -e -p "> " QUESTION + chat_completion "${QUESTION}" +done diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 872750053..12d4e2fa4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,799 +1,928 @@ -#include -#include #include "common.h" #include "llama.h" +#include "build-info.h" -struct server_params -{ - std::string hostname = "127.0.0.1"; - int32_t port = 8080; -}; +// single thread +#define CPPHTTPLIB_THREAD_POOL_COUNT 1 +#ifndef NDEBUG +// crash the server in debug mode, otherwise send an http 500 error +#define CPPHTTPLIB_NO_EXCEPTIONS 1 +#endif -struct llama_server_context -{ - bool as_loop = false; - bool has_next_token = false; - std::string generated_text = ""; +#include "httplib.h" +#include "json.hpp" - int32_t 
num_tokens_predicted = 0; - int32_t n_past = 0; - int32_t n_consumed = 0; - int32_t n_session_consumed = 0; - int32_t n_remain = 0; - - std::vector embd; - std::vector last_n_tokens; - std::vector processed_tokens; - std::vector llama_token_newline; - std::vector embd_inp; - std::vector> no_show_words; - std::vector tokens_predicted; - - llama_context *ctx; - gpt_params params; - - void rewind() { - as_loop = false; - params.antiprompt.clear(); - no_show_words.clear(); - num_tokens_predicted = 0; - generated_text = ""; - } - - bool loadModel(gpt_params params_) - { - params = params_; - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) - { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return false; - } - // determine newline token - llama_token_newline = ::llama_tokenize(ctx, "\n", false); - last_n_tokens.resize(params.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - return true; - } - - bool loadPrompt() { - params.prompt.insert(0, 1, ' '); // always add a first space - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); - // compare the evaluated prompt with the new prompt - int new_prompt_len = 0; - for (size_t i = 0; i < prompt_tokens.size(); i++) { - if (i < processed_tokens.size() && - processed_tokens[i] == prompt_tokens[i]) - { - continue; - } - else - { - embd_inp.push_back(prompt_tokens[i]); - if(new_prompt_len == 0) { - if(int32_t(i) - 1 < n_past) { - processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end()); - } - // Evaluate the new fragment prompt from the last token processed. 
- n_past = processed_tokens.size(); - } - new_prompt_len ++; - } - } - if(n_past > 0 && params.interactive) { - n_remain -= new_prompt_len; - } - if ((int)embd_inp.size() > params.n_ctx - 4) - { - return false; - } - has_next_token = true; - return true; - } - - void beginCompletion() - { - if(n_remain == 0) { - // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) - { - params.n_keep = (int)embd_inp.size(); - } - } - n_remain = params.n_predict; - } - - llama_token nextToken() { - llama_token result = -1; - if (embd.size() > 0) - { - if (n_past + (int)embd.size() > params.n_ctx) - { - // Reset context - const int n_left = n_past - params.n_keep; - n_past = std::max(1, params.n_keep); - processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); - embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); - } - for (int i = 0; i < (int)embd.size(); i += params.n_batch) - { - int n_eval = (int)embd.size() - i; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - has_next_token = false; - return result; - } - n_past += n_eval; - } - } - embd.clear(); - if ((int)embd_inp.size() <= n_consumed && has_next_token) - { - // out of user input, sample next token - const float temp = params.temp; - // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - llama_token id = 0; - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) - { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) - { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; - - // Apply penalties - float nl_logit = logits[llama_token_nl()]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) - { - logits[llama_token_nl()] = nl_logit; - } - - if (temp <= 0) - { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } - else - { - if (mirostat == 1) - { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } - else if (mirostat == 2) - { - static float mirostat_mu = 2.0f * mirostat_tau; - 
llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } - else - { - // Temperature sampling - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - processed_tokens.push_back(id); - num_tokens_predicted++; - } - - // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive) - { - id = llama_token_newline.front(); - if (params.antiprompt.size() != 0) - { - // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); - embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); - } - } - - // add it to the context - embd.push_back(id); - for (auto id : embd) - { - result = id; - } - // decrement remaining sampling budget - --n_remain; - } - else - { - // some user input remains from prompt or interaction, forward it to processing - while ((int)embd_inp.size() > n_consumed) - { - embd.push_back(embd_inp[n_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); - processed_tokens.push_back(embd_inp[n_consumed]); - ++n_consumed; - if ((int)embd.size() >= params.n_batch) - { - break; - } - } - } - if (params.interactive && (int)embd_inp.size() <= n_consumed) - { - // check for reverse prompt - if (params.antiprompt.size()) - { - std::string last_output; - for (auto id : last_n_tokens) - { - last_output += llama_token_to_str(ctx, id); - } - has_next_token = true; - // Check if each of the reverse prompts appears at the end of the output. 
- for (std::string &antiprompt : params.antiprompt) - { - if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) - { - has_next_token = false; - return result; - } - } - } - if (n_past > 0) - { - has_next_token = true; - } - } - - if (!embd.empty() && embd.back() == llama_token_eos()) { - has_next_token = false; - } - - if (params.interactive && n_remain <= 0 && params.n_predict != -1) - { - n_remain = params.n_predict; - } - has_next_token = n_remain != 0; - return result; - } - - std::string doCompletion() - { - llama_token token = nextToken(); - if (token == -1) { - return ""; - } - tokens_predicted.clear(); - tokens_predicted.push_back(token); - - // Avoid add the no show words to the response - for (std::vector word_tokens : no_show_words) - { - size_t match_token = 1; - if (tokens_predicted.front() == word_tokens.front()) - { - bool execute_matching = true; - if (tokens_predicted.size() > 1) { // if previus tokens had been tested - for (size_t i = 1; i < word_tokens.size(); i++) - { - if (i >= tokens_predicted.size()) { - match_token = i; - break; - } - if (tokens_predicted[i] == word_tokens[i]) - { - continue; - } - else - { - execute_matching = false; - break; - } - } - } - while (execute_matching) { - if (match_token == word_tokens.size()) { - return ""; - } - token = nextToken(); - tokens_predicted.push_back(token); - if (token == word_tokens[match_token]) - { // the token follow the sequence - match_token++; - } - else if (match_token < word_tokens.size()) - { // no complete all word sequence - break; - } - } - } - } - if(as_loop) { - generated_text = ""; - } - for (llama_token tkn : tokens_predicted) - { - generated_text += llama_token_to_str(ctx, tkn); - } - return generated_text; - } - - std::vector embedding(std::string content, int threads) { - content.insert(0, 1, ' '); - std::vector tokens = ::llama_tokenize(ctx, content, true); - if (tokens.size() > 0) - { - if 
(llama_eval(ctx, tokens.data(), tokens.size(), 0, threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - std::vector embeddings_; - return embeddings_; - } - } - const int n_embd = llama_n_embd(ctx); - const auto embeddings = llama_get_embeddings(ctx); - std::vector embeddings_(embeddings, embeddings + n_embd); - return embeddings_; - } -}; +#ifndef SERVER_VERBOSE +#define SERVER_VERBOSE 1 +#endif using namespace httplib; - using json = nlohmann::json; -void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) -{ - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - fprintf(stderr, " --embedding enable embedding mode\n"); - fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - if (llama_mlock_supported()) - { - fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); - } - if (llama_mmap_supported()) - { - fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); - fprintf(stderr, " number of layers to store in VRAM\n"); - fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); - fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); -#endif - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); - fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stderr, " --host ip address to listen (default 127.0.0.1)\n"); - fprintf(stderr, " --port PORT port to listen (default 8080)\n"); - fprintf(stderr, "\n"); +struct server_params { + std::string hostname = "127.0.0.1"; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; +}; + +static size_t common_part(const std::vector & a, const std::vector & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + return i; } -bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms) -{ - gpt_params default_params; - std::string arg; - bool invalid_param = false; +enum stop_type { + STOP_FULL, + STOP_PARTIAL, +}; - for (int i = 1; i < argc; i++) - { - arg = argv[i]; - if (arg == "--port") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - sparams.port = std::stoi(argv[i]); - } - else if (arg == "--host") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - sparams.hostname = argv[i]; - } - else if (arg == "-s" || arg == "--seed") - { -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n"); -#endif - if (++i >= argc) - { - invalid_param = true; - break; - } - params.seed = std::stoi(argv[i]); - } - else if (arg == "-m" || arg == "--model") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - 
params.model = argv[i]; - } - else if (arg == "-a" || arg == "--alias") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } - else if (arg == "--embedding") - { - params.embedding = true; - } - else if (arg == "-h" || arg == "--help") - { - server_print_usage(argc, argv, default_params); - exit(0); - } - else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } - else if (arg == "--memory-f32" || arg == "--memory_f32") - { - params.memory_f16 = false; - } - else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") - { - if (++i >= argc) - { - invalid_param = true; - break; - } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params.n_gpu_layers = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif - } - else if (arg == "--tensor-split" || arg == "-ts") - { - if (++i >= argc) - { - invalid_param = true; - break; - } -#ifdef GGML_USE_CUBLAS - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) - { - if (i < split_arg.size()) - { - params.tensor_split[i] = std::stof(split_arg[i]); - } - else - { - params.tensor_split[i] = 0.0f; - } - } -#else - fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. 
It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } - else if (arg == "--low-vram" || arg == "-lv") - { -#ifdef GGML_USE_CUBLAS - params.low_vram = true; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); -#endif // GGML_USE_CUBLAS - } - else if (arg == "--main-gpu" || arg == "-mg") - { - if (++i >= argc) - { - invalid_param = true; - break; - } -#ifdef GGML_USE_CUBLAS - params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif - } - else - { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params); - exit(1); - } - } - - if (invalid_param) - { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params); - exit(1); - } - return true; +static bool ends_with(const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -bool parse_options_completion(json body, llama_server_context& llama, Response &res) { - if (!body["threads"].is_null()) - { - llama.params.n_threads = body["threads"].get(); - } - if (!body["n_predict"].is_null()) - { - llama.params.n_predict = body["n_predict"].get(); - } - if (!body["top_k"].is_null()) - { - llama.params.top_k = body["top_k"].get(); - } - if (!body["top_p"].is_null()) - { - llama.params.top_p = body["top_p"].get(); - } - if (!body["temperature"].is_null()) - { - llama.params.temp = body["temperature"].get(); - } - if (!body["batch_size"].is_null()) - { - llama.params.n_batch = body["batch_size"].get(); - } - if (!body["n_keep"].is_null()) - { - llama.params.n_keep = body["n_keep"].get(); - } - if (!body["as_loop"].is_null()) - { - llama.as_loop = body["as_loop"].get(); - } - if 
(!body["interactive"].is_null()) - { - llama.params.interactive = body["interactive"].get(); - } - if (!body["prompt"].is_null()) - { - llama.params.prompt = body["prompt"].get(); - } - else - { - json data = { - {"status", "error"}, - {"reason", "You need to pass the prompt"}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return false; - } - if (!body["stop"].is_null()) - { - std::vector stop_words = body["stop"].get>(); - for (std::string stop_word : stop_words) - { - llama.params.antiprompt.push_back(stop_word); - llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false)); - } - } - if (!body["exclude"].is_null()) - { - std::vector no_show_words = body["exclude"].get>(); - for (std::string no_show : no_show_words) - { - llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false)); - } - } - return true; -} - -int main(int argc, char **argv) -{ - // own arguments required by this example - gpt_params params; - server_params sparams; - - // struct that contains llama context and inference - llama_server_context llama; - params.model = "ggml-model.bin"; - - if (server_params_parse(argc, argv, sparams, params) == false) - { - return 1; - } - - if (params.seed <= 0) - { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - // load the model - if (!llama.loadModel(params)) - { - return 1; - } - - Server svr; - - svr.Get("/", [](const Request &, Response &res) - { res.set_content("

llama.cpp server works

", "text/html"); }); - - svr.Post("/completion", [&llama](const Request &req, Response &res) - { - if(llama.params.embedding) { - json data = { - {"status", "error"}, - {"reason", "To use completion function disable embedding mode"}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return; - } - - llama.rewind(); - - if(parse_options_completion(json::parse(req.body), llama, res) == false){ - return; - } - - if (!llama.loadPrompt()) - { - json data = { - {"status", "error"}, - {"reason", "Context too long, please be more specific"}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return; - } - - llama.beginCompletion(); - if(llama.as_loop) { - json data = { - {"status", "done" } }; - return res.set_content(data.dump(), "application/json"); - } else { - // loop inference until finish completion - while (llama.has_next_token) - { - llama.doCompletion(); +static size_t find_partial_stop_string(const std::string & stop, + const std::string & text) { + if (!text.empty() && !stop.empty()) { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { + if (stop[char_index] == text_last_char) { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) { + return text.size() - char_index - 1; } - try - { - json data = { - {"model", llama.params.model_alias }, - {"content", llama.generated_text }, - {"tokens_predicted", llama.num_tokens_predicted}}; - return res.set_content(data.dump(), "application/json"); - } - catch (const json::exception &e) - { - // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data = { - {"content", "Bad encoding token"}, - {"tokens_predicted", 0}}; - return res.set_content(data.dump(), "application/json"); - } - } }); - - svr.Post("/tokenize", [&llama](const Request &req, Response &res) - { - json body = json::parse(req.body); - json data = { - {"tokens", 
::llama_tokenize(llama.ctx, body["content"].get(), false) } }; - return res.set_content(data.dump(), "application/json"); - }); - - svr.Post("/embedding", [&llama](const Request &req, Response &res) - { - if(!llama.params.embedding) { - std::vector empty; - json data = { - {"embedding", empty}}; - fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n"); - return res.set_content(data.dump(), "application/json"); - } - json body = json::parse(req.body); - std::string content = body["content"].get(); - int threads = body["threads"].get(); - json data = { - {"embedding", llama.embedding(content, threads) } }; - return res.set_content(data.dump(), "application/json"); - }); - - svr.Get("/next-token", [&llama](const Request &req, Response &res) - { - if(llama.params.embedding) { - res.set_content("{}", "application/json"); - return; } - std::string result = ""; - if (req.has_param("stop")) { - llama.has_next_token = false; + } + } + return std::string::npos; +} + +template +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { + std::string ret; + for (; begin != end; ++begin) { + ret += llama_token_to_str(ctx, *begin); + } + return ret; +} + +static void server_log(const char * level, const char * function, int line, + const char * message, const nlohmann::ordered_json & extra) { + nlohmann::ordered_json log { + { "timestamp", time(nullptr) }, + { "level", level }, + { "function", function }, + { "line", line }, + { "message", message }, + }; + + if (!extra.empty()) { + log.merge_patch(extra); + } + + const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); + fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); + fflush(stdout); +} + +static bool server_verbose = false; + +#if SERVER_VERBOSE != 1 +# define LOG_VERBOSE(MSG, ...) +#else +# define LOG_VERBOSE(MSG, ...) 
\ + do { \ + if (server_verbose) { \ + server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while(0) +#endif + +#define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + +struct llama_server_context { + bool stream = false; + bool has_next_token = false; + std::string generated_text; + + size_t num_tokens_predicted = 0; + size_t n_past = 0; + size_t n_remain = 0; + + std::vector embd; + std::vector last_n_tokens; + + llama_context * ctx = nullptr; + gpt_params params; + + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + std::string stopping_word; + int32_t multibyte_pending = 0; + + ~llama_server_context() { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + } + + void rewind() { + params.antiprompt.clear(); + num_tokens_predicted = 0; + generated_text = ""; + generated_text.reserve(params.n_ctx); + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + multibyte_pending = 0; + + n_remain = 0; + n_past = 0; + } + + bool loadModel(const gpt_params & params_) { + params = params_; + ctx = llama_init_from_gpt_params(params); + if (ctx == nullptr) { + LOG_ERROR("unable to load model", { { "model", params_.model } }); + return false; + } + + last_n_tokens.resize(params.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + return true; + } + + void loadPrompt() { + params.prompt.insert(0, 1, ' '); // always add a first space + std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + + if (params.n_keep < 0) { + params.n_keep = (int)prompt_tokens.size(); + } + params.n_keep = std::min(params.n_ctx - 4, params.n_keep); + + // if input prompt is too big, truncate like normal + if 
(prompt_tokens.size() >= (size_t)params.n_ctx) { + const int n_left = (params.n_ctx - params.n_keep) / 2; + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); + std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + + LOG_VERBOSE("input truncated", { + { "n_ctx", params.n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, + }); + + truncated = true; + prompt_tokens = new_tokens; + } else { + const size_t ps = prompt_tokens.size(); + std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); + std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); + } + + // compare the evaluated prompt with the new prompt + n_past = common_part(embd, prompt_tokens); + embd = prompt_tokens; + if (n_past == prompt_tokens.size()) { + // we have to evaluate at least 1 token to generate logits. 
+ n_past--; + } + + LOG_VERBOSE("prompt ingested", { + { "n_past", n_past }, + { "cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past) }, + { "to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, + }); + + has_next_token = true; + } + + void beginCompletion() { + // number of tokens to keep when resetting context + n_remain = params.n_predict; + llama_set_rng_seed(ctx, params.seed); + } + + llama_token nextToken() { + llama_token result = -1; + + if (embd.size() >= (size_t)params.n_ctx) { + // Reset context + const int n_left = (params.n_ctx - params.n_keep) / 2; + + std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); + new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); + embd = new_tokens; + n_past = params.n_keep; + truncated = true; + LOG_VERBOSE("input truncated", { + { "n_ctx", params.n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, + }); + } + + while (n_past < embd.size()) { + int n_eval = (int)embd.size() - n_past; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) { + LOG_ERROR("failed to eval", { + { "n_eval", n_eval }, + { "n_past", n_past }, + { "n_threads", params.n_threads }, + { "embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, + }); + has_next_token = false; + return result; + } + n_past += n_eval; + } + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + llama_token id = 0; + + { + auto * logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (const auto & it : params.logit_bias) { + logits[it.first] += it.second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + float nl_logit = logits[llama_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); } else { - result = llama.doCompletion(); // inference next token + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + 
llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } } - try { - json data = { - {"content", result }, - {"stop", !llama.has_next_token }}; - return res.set_content(data.dump(), "application/json"); - } catch (const json::exception &e) { - // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data = { - {"content", "" }, - {"stop", !llama.has_next_token }}; - return res.set_content(data.dump(), "application/json"); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + num_tokens_predicted++; + } + + // add it to the context + embd.push_back(id); + result = id; + // decrement remaining sampling budget + --n_remain; + + if (!embd.empty() && embd.back() == llama_token_eos()) { + //stopping_word = llama_token_to_str(ctx, embd.back()); + has_next_token = false; + stopped_eos = true; + LOG_VERBOSE("eos token found", {}); + return result; + } + + has_next_token = params.n_predict == -1 || n_remain != 0; + return result; + } + + size_t findStoppingStrings(const std::string & text, const size_t last_token_size, + const stop_type type) { + size_t stop_pos = std::string::npos; + for (const std::string & word : params.antiprompt) { + size_t pos; + if (type == STOP_FULL) { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? 
text.size() - tmp : 0; + pos = text.find(word, from_pos); } - }); + else { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) { + if (type == STOP_FULL) { + stopping_word = word; + stopped_word = true; + has_next_token = false; + } + stop_pos = pos; + } + } + return stop_pos; + } - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); + std::string doCompletion() { + const llama_token token = nextToken(); - if(params.embedding) { - fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n"); - } + const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); + generated_text += token_text; - // change hostname and port - svr.listen(sparams.hostname, sparams.port); + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } + + if (multibyte_pending > 0 && !has_next_token) { + has_next_token = true; + n_remain++; + } + + if (!has_next_token && n_remain == 0) { + stopped_limit = true; + } + + LOG_VERBOSE("next token", { + { "token", token }, + { "token_text", llama_token_to_str(ctx, token) }, + { "has_next_token", has_next_token }, + { "n_remain", n_remain }, + { "num_tokens_predicted", num_tokens_predicted }, + { "stopped_eos", stopped_eos }, + { "stopped_word", stopped_word }, + { "stopped_limit", stopped_limit }, + { "stopping_word", stopping_word }, + }); + + return token_text; + } +}; + +static void 
server_print_usage(const char * argv0, const gpt_params & params, + const server_params & sparams) { + fprintf(stderr, "usage: %s [options]\n", argv0); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); + if (llama_mlock_supported()) { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) { + fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); + fprintf(stderr, " number of layers to store in VRAM\n"); + fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n");
+    fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
+    fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+#endif
+    fprintf(stderr, " -m FNAME, --model FNAME\n");
+    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
+    fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stderr, " --host ip address to listen (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stderr, " --port PORT port to listen (default: %d)\n", sparams.port);
+    fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stderr, "\n");
+}
+
+// Parses the command line into `sparams` (host, port, timeouts) and `params`
+// (model path and inference settings).
+//
+// Exits the process after printing the usage text when:
+//   - `-h` / `--help` is given (status 0),
+//   - an argument is unknown (status 1),
+//   - a flag is missing its value or the value is not a valid number (status 1).
+static void server_params_parse(int argc, char ** argv, server_params & sparams,
+                                gpt_params & params) {
+    gpt_params default_params;
+    server_params default_sparams;
+    std::string arg;
+    bool invalid_param = false;
+
+    try {
+        for (int i = 1; i < argc; i++) {
+            arg = argv[i];
+            if (arg == "--port") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                sparams.port = std::stoi(argv[i]);
+            } else if (arg == "--host") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                sparams.hostname = argv[i];
+            } else if (arg == "--timeout" || arg == "-to") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                // One value drives both directions; parse it once.
+                const int timeout = std::stoi(argv[i]);
+                sparams.read_timeout = timeout;
+                sparams.write_timeout = timeout;
+            } else if (arg == "-m" || arg == "--model") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.model = argv[i];
+            } else if (arg == "-a" || arg == "--alias") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.model_alias = argv[i];
+            } else if (arg == "-h" || arg == "--help") {
+                server_print_usage(argv[0], default_params, default_sparams);
+                exit(0);
+            } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.n_ctx = std::stoi(argv[i]);
+            } else if (arg == "--memory-f32" || arg == "--memory_f32") {
+                params.memory_f16 = false;
+            } else if (arg == "--threads" || arg == "-t") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.n_threads = std::stoi(argv[i]);
+            } else if (arg == "-b" || arg == "--batch-size") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.n_batch = std::stoi(argv[i]);
+                // The batch size is capped at 512.
+                params.n_batch = std::min(512, params.n_batch);
+            } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+                params.n_gpu_layers = std::stoi(argv[i]);
+#else
+                LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+                            "See main README.md for information on enabling GPU BLAS support", { { "n_gpu_layers", params.n_gpu_layers } });
+#endif
+            } else if (arg == "--tensor-split" || arg == "-ts") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+#ifdef GGML_USE_CUBLAS
+                std::string arg_next = argv[i];
+
+                // split string by , and /
+                const std::regex regex{ R"([,/]+)" };
+                std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+                std::vector<std::string> split_arg{ it, {} };
+                GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+                // Devices without an explicit proportion get 0.
+                for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+                    if (i_device < split_arg.size()) {
+                        params.tensor_split[i_device] = std::stof(split_arg[i_device]);
+                    } else {
+                        params.tensor_split[i_device] = 0.0f;
+                    }
+                }
+#else
+                LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+#endif // GGML_USE_CUBLAS
+            } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+                params.low_vram = true;
+#else
+                fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
+            } else if (arg == "--main-gpu" || arg == "-mg") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+#ifdef GGML_USE_CUBLAS
+                params.main_gpu = std::stoi(argv[i]);
+#else
+                LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+#endif
+            } else if (arg == "--lora") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.lora_adapter = argv[i];
+                // A LoRA adapter implies --no-mmap (see the usage text).
+                params.use_mmap = false;
+            } else if (arg == "--lora-base") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.lora_base = argv[i];
+            } else if (arg == "-v" || arg == "--verbose") {
+#if SERVER_VERBOSE != 1
+                LOG_WARNING("server.cpp is not built with verbose logging.", {});
+#else
+                server_verbose = true;
+#endif
+            } else if (arg == "--mlock") {
+                params.use_mlock = true;
+            } else if (arg == "--no-mmap") {
+                params.use_mmap = false;
+            } else {
+                fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+                server_print_usage(argv[0], default_params, default_sparams);
+                exit(1);
+            }
+        }
+    } catch (const std::exception &) {
+        // std::stoi / std::stof throw on non-numeric or out-of-range values.
+        // `arg` still holds the flag whose value was rejected, so report it
+        // through the regular invalid-parameter path instead of terminating.
+        invalid_param = true;
+    }
+
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        server_print_usage(argv[0], default_params, default_sparams);
+        exit(1);
+    }
+}
+
+// Returns the generation settings currently stored on `llama` as a JSON object.
+// `ignore_eos` is reported as true when the EOS token has a negative-infinite
+// logit bias.
+static json format_generation_settings(llama_server_context & llama) {
+    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
+                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+
+    return json {
+        { "seed", llama.params.seed },
+        { "temp", llama.params.temp },
+        { "top_k", llama.params.top_k },
+        { "top_p", llama.params.top_p },
+        { "tfs_z", llama.params.tfs_z },
+        { "typical_p", llama.params.typical_p },
+        { "repeat_last_n", llama.params.repeat_last_n },
+        { "repeat_penalty", llama.params.repeat_penalty },
+        { "presence_penalty", llama.params.presence_penalty },
+        { "frequency_penalty", llama.params.frequency_penalty },
+        { "mirostat", llama.params.mirostat },
+        { "mirostat_tau", llama.params.mirostat_tau },
+        { "mirostat_eta", llama.params.mirostat_eta },
+        { "penalize_nl", llama.params.penalize_nl },
+        { "stop", llama.params.antiprompt },
+        { "n_predict", llama.params.n_predict },
+        { "n_keep", llama.params.n_keep },
+        { "ignore_eos", ignore_eos },
+        { "stream", llama.stream },
+        { "logit_bias", llama.params.logit_bias },
+    };
+}
+
+// Builds the JSON payload sent once generation has finished: the given
+// `content` with "stop" set to true, plus the model alias, token count,
+// generation settings, prompt and the stop-reason fields read from `llama`.
+static json format_final_response(llama_server_context & llama, const std::string & content) {
+    return json {
+        { "content", content },
+        { "stop", true },
+        { "model", llama.params.model_alias },
+        { "tokens_predicted", llama.num_tokens_predicted },
+        { "generation_settings", format_generation_settings(llama) },
+        { "prompt", llama.params.prompt },
+        { "truncated", llama.truncated },
+        { "stopped_eos", llama.stopped_eos },
+        { "stopped_word", llama.stopped_word },
+        { "stopped_limit", llama.stopped_limit },
+        { "stopping_word", llama.stopping_word },
+    };
+}
+
+// Builds the JSON payload for an intermediate streamed chunk ("stop" is false).
+static json format_partial_response(const std::string & content) {
+    return json {
+        { "content", content },
+        { "stop", false },
+    };
+}
+
+// Wraps a token list as { "tokens": [...] }.
+static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+    return json {
+        { "tokens", tokens }
+    };
+}
+
+// Applies the options of a /completion request `body` to `llama`.
+// Every option missing from the body is reset to its `gpt_params` default
+// ("stream" and "ignore_eos" default to false), so settings do not leak from
+// one request to the next. The logit-bias map and the stop-word list are
+// rebuilt from scratch.
+static void parse_options_completion(const json & body, llama_server_context & llama) {
+    gpt_params default_params;
+
+    llama.stream = body.value("stream", false);
+    llama.params.n_predict = body.value("n_predict", default_params.n_predict);
+    llama.params.top_k = body.value("top_k", default_params.top_k);
+    llama.params.top_p = body.value("top_p", default_params.top_p);
+    llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
+    llama.params.typical_p = body.value("typical_p", default_params.typical_p);
+    llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
+    llama.params.temp = body.value("temperature", default_params.temp);
+    llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
+    llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
+    llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
+    llama.params.mirostat = body.value("mirostat", default_params.mirostat);
+    llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
+    llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
+    llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
+    llama.params.n_keep = body.value("n_keep", default_params.n_keep);
+    llama.params.seed = body.value("seed", default_params.seed);
+    llama.params.prompt = body.value("prompt", default_params.prompt);
+
+    llama.params.logit_bias.clear();
+    if (body.value("ignore_eos", false)) {
+        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+    }
+
+    // "logit_bias" is an array of [token, bias] pairs. `bias` is either a
+    // number or the boolean `false`, which is stored as -INFINITY.
+    // Malformed entries and out-of-vocabulary tokens are skipped.
+    const auto & logit_bias = body.find("logit_bias");
+    if (logit_bias != body.end() && logit_bias->is_array()) {
+        const int n_vocab = llama_n_vocab(llama.ctx);
+        for (const auto & el : *logit_bias) {
+            if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
+                llama_token tok = el[0].get<llama_token>();
+                if (tok >= 0 && tok < n_vocab) {
+                    if (el[1].is_number()) {
+                        llama.params.logit_bias[tok] = el[1].get<float>();
+                    } else if (el[1].is_boolean() && !el[1].get<bool>()) {
+                        llama.params.logit_bias[tok] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    llama.params.antiprompt.clear();
+    const auto & stop = body.find("stop");
+    if (stop != body.end() && stop->is_array()) {
+        for (const auto & word : *stop) {
+            if (!word.empty()) {
+                llama.params.antiprompt.push_back(word);
+            }
+        }
+    }
+
+    LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
+}
+
+// Request logger installed on the HTTP server: logs the peer address and port,
+// response status, path and both bodies.
+static void log_server_request(const Request & req, const Response & res) {
+    LOG_INFO("request", {
+        { "remote_addr", req.remote_addr },
+        { "remote_port", req.remote_port },
+        { "status", res.status },
+        { "path", req.path },
+        { "request", req.body },
+        { "response", res.body },
+    });
+}
+
+// Entry point: parses the command line, loads the model, registers the HTTP
+// routes (/, /completion, /tokenize and a catch-all OPTIONS handler) and
+// serves until the listener stops. Returns 1 when the model cannot be loaded,
+// the socket cannot be bound or listening fails; 0 otherwise.
+int main(int argc, char ** argv) {
+    // own arguments required by this example
+    gpt_params params;
+    server_params sparams;
+
+    // struct that contains llama context and inference
+    llama_server_context llama;
+
+    server_params_parse(argc, argv, sparams, params);
+
+    // Fall back to the model path when no alias was given.
+    if (params.model_alias == "unknown") {
+        params.model_alias = params.model;
+    }
+
+    llama_init_backend();
+
+    LOG_INFO("build info", {
+        { "build", BUILD_NUMBER },
+        { "commit", BUILD_COMMIT }
+    });
+    LOG_INFO("system info", {
+        { "n_threads", params.n_threads },
+        { "total_threads", std::thread::hardware_concurrency() },
+        { "system_info", llama_print_system_info() },
+    });
+
+    // load the model
+    if (!llama.loadModel(params)) {
+        return 1;
+    }
+
+    Server svr;
+
+    svr.set_default_headers({
+        { "Access-Control-Allow-Origin", "*" },
+        { "Access-Control-Allow-Headers", "content-type" }
+    });
+
+    svr.Get("/", [](const Request &, Response & res) {
+        // NOTE(review): the markup around this text was lost when the patch was
+        // extracted; <h1> is presumed -- confirm against the original commit.
+        res.set_content("<h1>llama.cpp server works</h1>", "text/html");
+    });
+
+    svr.Post("/completion", [&llama](const Request & req, Response & res) {
+        llama.rewind();
+        llama_reset_timings(llama.ctx);
+
+        parse_options_completion(json::parse(req.body), llama);
+
+        llama.loadPrompt();
+        llama.beginCompletion();
+
+        if (!llama.stream) {
+            // Blocking mode: generate everything, then answer once.
+            size_t stop_pos = std::string::npos;
+
+            while (llama.has_next_token) {
+                const std::string token_text = llama.doCompletion();
+
+                stop_pos = llama.findStoppingStrings(llama.generated_text,
+                                                     token_text.size(), STOP_FULL);
+            }
+
+            // No full stop string matched: also cut a trailing partial match.
+            if (stop_pos == std::string::npos) {
+                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
+            }
+            if (stop_pos != std::string::npos) {
+                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
+                                           llama.generated_text.end());
+            }
+
+            const json data = format_final_response(llama, llama.generated_text);
+
+            llama_print_timings(llama.ctx);
+
+            // error_handler_t::replace substitutes invalid UTF-8 instead of throwing.
+            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
+        } else {
+            // Streaming mode: emit one server-sent event per chunk of text.
+            const auto chunked_content_provider = [&](size_t, DataSink & sink) {
+                // Number of bytes of generated_text already sent to the client.
+                size_t sent_count = 0;
+
+                while (llama.has_next_token) {
+                    const std::string token_text = llama.doCompletion();
+                    // Hold back output while a multi-byte character is incomplete.
+                    if (llama.multibyte_pending > 0) {
+                        continue;
+                    }
+
+                    size_t pos = std::min(sent_count, llama.generated_text.size());
+
+                    const std::string str_test = llama.generated_text.substr(pos);
+                    size_t stop_pos =
+                        llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
+                    if (stop_pos != std::string::npos) {
+                        llama.generated_text.erase(
+                            llama.generated_text.begin() + pos + stop_pos,
+                            llama.generated_text.end());
+                        pos = std::min(sent_count, llama.generated_text.size());
+                    } else {
+                        // Withhold text that might be the beginning of a stop string.
+                        stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
+                                                             STOP_PARTIAL);
+                    }
+
+                    const std::string to_send = llama.generated_text.substr(pos, stop_pos);
+                    sent_count += to_send.size();
+
+                    const json data = llama.has_next_token
+                                          ? format_partial_response(to_send)
+                                          // Generation is done, send extra information.
+                                          : format_final_response(llama, to_send);
+
+                    const std::string str =
+                        "data: " +
+                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        "\n\n";
+
+                    LOG_VERBOSE("data stream", {
+                        { "to_send", str }
+                    });
+
+                    if (!sink.write(str.data(), str.size())) {
+                        LOG_VERBOSE("stream closed", {});
+                        llama_print_timings(llama.ctx);
+                        return false;
+                    }
+                }
+
+                llama_print_timings(llama.ctx);
+                sink.done();
+                return true;
+            };
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+        }
+    });
+
+    svr.Options(R"(/.*)", [](const Request &, Response & res) {
+        return res.set_content("", "application/json");
+    });
+
+    svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
+        const json body = json::parse(req.body);
+        // value() falls back to "" when "content" is absent; operator[] on a
+        // const json with a missing key is undefined behaviour.
+        const std::string content = body.value("content", "");
+        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        const json data = format_tokenizer_response(tokens);
+        return res.set_content(data.dump(), "application/json");
+    });
+
+    svr.set_logger(log_server_request);
+
+    // Turn any exception escaping a handler into a plain-text 500 response.
+    svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
+        const auto * fmt = "500 Internal Server Error\n%s";
+        char buf[BUFSIZ];
+        try {
+            std::rethrow_exception(std::move(ep));
+        } catch (const std::exception & e) {
+            snprintf(buf, sizeof(buf), fmt, e.what());
+        } catch (...) {
+            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+        }
+        res.set_content(buf, "text/plain");
+        res.status = 500;
+    });
+
+    // set timeouts and change hostname and port
+    svr.set_read_timeout(sparams.read_timeout);
+    svr.set_write_timeout(sparams.write_timeout);
+
+    if (!svr.bind_to_port(sparams.hostname, sparams.port)) {
+        LOG_ERROR("couldn't bind to server socket", {
+            { "hostname", sparams.hostname },
+            { "port", sparams.port },
+        });
+        return 1;
+    }
+
+    LOG_INFO("HTTP server listening", {
+        { "hostname", sparams.hostname },
+        { "port", sparams.port },
+    });
+
+    if (!svr.listen_after_bind()) {
+        return 1;
+    }
+
+    return 0;
+}