llama.cpp/examples/server/chat.mjs

import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process'
import { readFileSync } from 'node:fs'
import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'

const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
    (_, index) => args[index - 1] === "--grammar-json-schema"
);

const no_cached_prompt = args.find(
    (_, index) => args[index - 1] === "--no-cache-prompt"
) ?? "false";

const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");

// Example usage: function,arguments
const grammarJsonSchemaPropOrder = args.find(
    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
);
const propOrder = grammarJsonSchemaPropOrder
    ? grammarJsonSchemaPropOrder
          .split(",")
          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
    : {};

let grammar = null
if (grammarJsonSchemaFile) {
    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
    const converter = new SchemaConverter(propOrder)
    converter.visit(schema, '')
    grammar = converter.formatGrammar()
}
if (grammarFile) {
    grammar = readFileSync(grammarFile, 'utf-8')
}

// for cached prompt
let slot_id = -1;

const API_URL = 'http://127.0.0.1:8080'

const chat = [
    {
        human: "Hello, Assistant.",
        assistant: "Hello. How may I help you today?"
    },
    {
        human: "Please tell me the largest city in Europe.",
        assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
    },
]

const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`

function format_prompt(question) {
    return `${instruction}\n${
        chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
    }\n### Human: ${question}\n### Assistant:`
}

async function tokenize(content) {
    const result = await fetch(`${API_URL}/tokenize`, {
        method: 'POST',
        body: JSON.stringify({ content })
    })

    if (!result.ok) {
        return []
    }

    return await result.json().tokens
}

const n_keep = await tokenize(instruction).length

async function chat_completion(question) {
    const result = await fetch(`${API_URL}/completion`, {
        method: 'POST',
        body: JSON.stringify({
            prompt: format_prompt(question),
            temperature: 0.2,
            top_k: 40,
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
            cache_prompt: no_cached_prompt === "false",
            slot_id: slot_id,
            stop: ["\n### Human:"], // stop completion after generating this
            grammar,
            stream: true,
        })
    })

    if (!result.ok) {
        return
    }

    let answer = ''

    for await (var chunk of result.body) {
        const t = Buffer.from(chunk).toString('utf8')
        if (t.startsWith('data: ')) {
            const message = JSON.parse(t.substring(6))
            slot_id = message.slot_id
            answer += message.content
            process.stdout.write(message.content)
            if (message.stop) {
                if (message.truncated) {
                    chat.shift()
                }
                break
            }
        }
    }

    process.stdout.write('\n')
    chat.push({ human: question, assistant: answer.trimStart() })
}

const rl = readline.createInterface({ input: stdin, output: stdout });

const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {
    rl.question(query, options, resolve)
});

while(true) {
    const question = await readlineQuestion(rl, '> ')
    await chat_completion(question)
}
Server Example Refactor and Improvements (#1570) A major rewrite for the server example. Note that if you have built something on the previous server API, it will probably be incompatible. Check out the examples for how a typical chat app could work. This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing. Summary of the changes: - adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos - applies missing top k sampler - removes interactive mode/terminal-like behavior, removes exclude parameter - moves threads and batch size to server command-line parameters - adds LoRA loading and matches command line parameters with main example - fixes stopping on EOS token and with the specified token amount with n_predict - adds server timeouts, host, and port settings - adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text - sets defaults for unspecified parameters between requests - removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming - adds CORS headers to responses - adds request logging, exception printing and optional verbose logging - adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string - adds printing an error when it can't bind to the host/port specified - fixes multi-byte character handling and replaces invalid UTF-8 characters on responses - prints timing and build info on startup - adds logit bias to request parameters - removes embedding mode - updates documentation; adds streaming Node.js and Bash examples - fixes code formatting - sets server threads to 1 since the current global state doesn't work well with simultaneous requests - adds truncation of the input prompt and better context reset - removes token limit from the input prompt - significantly simplified the logic and removed a lot of variables --------- Co-authored-by: anon998 <131767832+anon998@users.noreply.github.com> Co-authored-by: Henri Vasserman <henv@hot.ee> Co-authored-by: Felix Hellmann <privat@cirk2.de> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Lesaun Harvey <Lesaun@gmail.com> 2023-06-17 11:53:04 +00:00			`import * as readline from 'node:readline'`
			`import { stdin, stdout } from 'node:process'`
server : implement json-schema-to-grammar.mjs & add grammar param in the UI (#2588) * server : implement json-schema-to-grammar.mjs by follow python impl * server : add grammar support in chat.mjs * server : implement grammer param in the UI * server : generate .hpp * server : remove trailing whitespaces * server : generate .hpp * server : fix sort of prop pairs * server : optimize regex & iteration 2023-08-14 07:16:54 +00:00			`import { readFileSync } from 'node:fs'`
			`import { SchemaConverter } from './public/json-schema-to-grammar.mjs'`

			`const args = process.argv.slice(2);`
			`const grammarJsonSchemaFile = args.find(`
			`(_, index) => args[index - 1] === "--grammar-json-schema"`
			`);`
chat.mjs support cached prompt + some fixes 2023-10-13 15:06:41 +00:00
			`const no_cached_prompt = args.find(`
			`(_, index) => args[index - 1] === "--no-cache-prompt"`
			`) ?? "false";`

server : implement json-schema-to-grammar.mjs & add grammar param in the UI (#2588) * server : implement json-schema-to-grammar.mjs by follow python impl * server : add grammar support in chat.mjs * server : implement grammer param in the UI * server : generate .hpp * server : remove trailing whitespaces * server : generate .hpp * server : fix sort of prop pairs * server : optimize regex & iteration 2023-08-14 07:16:54 +00:00			`const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");`

			`// Example usage: function,arguments`
			`const grammarJsonSchemaPropOrder = args.find(`
			`(_, index) => args[index - 1] === "--grammar-json-schema-prop-order"`
			`);`
			`const propOrder = grammarJsonSchemaPropOrder`
			`? grammarJsonSchemaPropOrder`
			`.split(",")`
			`.reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})`
			`: {};`

			`let grammar = null`
			`if (grammarJsonSchemaFile) {`
			`const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))`
			`const converter = new SchemaConverter(propOrder)`
			`converter.visit(schema, '')`
			`grammar = converter.formatGrammar()`
			`}`
			`if (grammarFile) {`
			`grammar = readFileSync(grammarFile, 'utf-8')`
			`}`
Server Example Refactor and Improvements (#1570) A major rewrite for the server example. Note that if you have built something on the previous server API, it will probably be incompatible. Check out the examples for how a typical chat app could work. This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing. Summary of the changes: - adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos - applies missing top k sampler - removes interactive mode/terminal-like behavior, removes exclude parameter - moves threads and batch size to server command-line parameters - adds LoRA loading and matches command line parameters with main example - fixes stopping on EOS token and with the specified token amount with n_predict - adds server timeouts, host, and port settings - adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text - sets defaults for unspecified parameters between requests - removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming - adds CORS headers to responses - adds request logging, exception printing and optional verbose logging - adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string - adds printing an error when it can't bind to the host/port specified - fixes multi-byte character handling and replaces invalid UTF-8 characters on responses - prints timing and build info on startup - adds logit bias to request parameters - removes embedding mode - updates documentation; adds streaming Node.js and Bash examples - fixes code formatting - sets server threads to 1 since the current global state doesn't work well with simultaneous requests - adds truncation of the input prompt and better context reset - removes token limit from the input prompt - significantly simplified the logic and removed a lot of variables --------- Co-authored-by: anon998 <131767832+anon998@users.noreply.github.com> Co-authored-by: Henri Vasserman <henv@hot.ee> Co-authored-by: Felix Hellmann <privat@cirk2.de> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Lesaun Harvey <Lesaun@gmail.com> 2023-06-17 11:53:04 +00:00
chat.mjs support cached prompt + some fixes 2023-10-13 15:06:41 +00:00			`// for cached prompt`
			`let slot_id = -1;`

Server Example Refactor and Improvements (#1570) A major rewrite for the server example. Note that if you have built something on the previous server API, it will probably be incompatible. Check out the examples for how a typical chat app could work. This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing. Summary of the changes: - adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos - applies missing top k sampler - removes interactive mode/terminal-like behavior, removes exclude parameter - moves threads and batch size to server command-line parameters - adds LoRA loading and matches command line parameters with main example - fixes stopping on EOS token and with the specified token amount with n_predict - adds server timeouts, host, and port settings - adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text - sets defaults for unspecified parameters between requests - removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming - adds CORS headers to responses - adds request logging, exception printing and optional verbose logging - adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string - adds printing an error when it can't bind to the host/port specified - fixes multi-byte character handling and replaces invalid UTF-8 characters on responses - prints timing and build info on startup - adds logit bias to request parameters - removes embedding mode - updates documentation; adds streaming Node.js and Bash examples - fixes code formatting - sets server threads to 1 since the current global state doesn't work well with simultaneous requests - adds truncation of the input prompt and better context reset - removes token limit from the input prompt - significantly simplified the logic and removed a lot of variables --------- Co-authored-by: anon998 <131767832+anon998@users.noreply.github.com> Co-authored-by: Henri Vasserman <henv@hot.ee> Co-authored-by: Felix Hellmann <privat@cirk2.de> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Lesaun Harvey <Lesaun@gmail.com> 2023-06-17 11:53:04 +00:00			`const API_URL = 'http://127.0.0.1:8080'`

			`const chat = [`
			`{`
			`human: "Hello, Assistant.",`
			`assistant: "Hello. How may I help you today?"`
			`},`
			`{`
			`human: "Please tell me the largest city in Europe.",`
			`assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."`
			`},`
			`]`

			const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`

			`function format_prompt(question) {`
			return `${instruction}\n${
			chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
			}\n### Human: ${question}\n### Assistant:`
			`}`

			`async function tokenize(content) {`
			const result = await fetch(`${API_URL}/tokenize`, {
			`method: 'POST',`
			`body: JSON.stringify({ content })`
			`})`

			`if (!result.ok) {`
			`return []`
			`}`

			`return await result.json().tokens`
			`}`

			`const n_keep = await tokenize(instruction).length`

			`async function chat_completion(question) {`
			const result = await fetch(`${API_URL}/completion`, {
			`method: 'POST',`
			`body: JSON.stringify({`
			`prompt: format_prompt(question),`
			`temperature: 0.2,`
			`top_k: 40,`
			`top_p: 0.9,`
			`n_keep: n_keep,`
			`n_predict: 256,`
chat.mjs support cached prompt + some fixes 2023-10-13 15:06:41 +00:00			`cache_prompt: no_cached_prompt === "false",`
			`slot_id: slot_id,`
fixed premature end due stop word 2023-10-16 16:36:05 +00:00			`stop: ["\n### Human:"], // stop completion after generating this`
server : implement json-schema-to-grammar.mjs & add grammar param in the UI (#2588) * server : implement json-schema-to-grammar.mjs by follow python impl * server : add grammar support in chat.mjs * server : implement grammer param in the UI * server : generate .hpp * server : remove trailing whitespaces * server : generate .hpp * server : fix sort of prop pairs * server : optimize regex & iteration 2023-08-14 07:16:54 +00:00			`grammar,`
Server Example Refactor and Improvements (#1570) A major rewrite for the server example. Note that if you have built something on the previous server API, it will probably be incompatible. Check out the examples for how a typical chat app could work. This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing. Summary of the changes: - adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos - applies missing top k sampler - removes interactive mode/terminal-like behavior, removes exclude parameter - moves threads and batch size to server command-line parameters - adds LoRA loading and matches command line parameters with main example - fixes stopping on EOS token and with the specified token amount with n_predict - adds server timeouts, host, and port settings - adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text - sets defaults for unspecified parameters between requests - removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming - adds CORS headers to responses - adds request logging, exception printing and optional verbose logging - adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string - adds printing an error when it can't bind to the host/port specified - fixes multi-byte character handling and replaces invalid UTF-8 characters on responses - prints timing and build info on startup - adds logit bias to request parameters - removes embedding mode - updates documentation; adds streaming Node.js and Bash examples - fixes code formatting - sets server threads to 1 since the current global state doesn't work well with simultaneous requests - adds truncation of the input prompt and better context reset - removes token limit from the input prompt - significantly simplified the logic and removed a lot of variables --------- Co-authored-by: anon998 <131767832+anon998@users.noreply.github.com> Co-authored-by: Henri Vasserman <henv@hot.ee> Co-authored-by: Felix Hellmann <privat@cirk2.de> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Lesaun Harvey <Lesaun@gmail.com> 2023-06-17 11:53:04 +00:00			`stream: true,`
			`})`
			`})`

			`if (!result.ok) {`
			`return`
			`}`

			`let answer = ''`

			`for await (var chunk of result.body) {`
			`const t = Buffer.from(chunk).toString('utf8')`
			`if (t.startsWith('data: ')) {`
			`const message = JSON.parse(t.substring(6))`
chat.mjs support cached prompt + some fixes 2023-10-13 15:06:41 +00:00			`slot_id = message.slot_id`
Server Example Refactor and Improvements (#1570) A major rewrite for the server example. Note that if you have built something on the previous server API, it will probably be incompatible. Check out the examples for how a typical chat app could work. This took a lot of effort, there are 24 PR's closed in the submitter's repo alone, over 160 commits and a lot of comments and testing. Summary of the changes: - adds missing generation parameters: tfs_z, typical_p, repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, mirostat, penalize_nl, seed, ignore_eos - applies missing top k sampler - removes interactive mode/terminal-like behavior, removes exclude parameter - moves threads and batch size to server command-line parameters - adds LoRA loading and matches command line parameters with main example - fixes stopping on EOS token and with the specified token amount with n_predict - adds server timeouts, host, and port settings - adds expanded generation complete response; adds generation settings, stop reason, prompt truncated, model used, and final text - sets defaults for unspecified parameters between requests - removes /next-token endpoint and as_loop parameter, adds stream parameter and server-sent events for streaming - adds CORS headers to responses - adds request logging, exception printing and optional verbose logging - adds better stopping words handling when matching multiple tokens and while streaming, or when it finishes on a partial stop string - adds printing an error when it can't bind to the host/port specified - fixes multi-byte character handling and replaces invalid UTF-8 characters on responses - prints timing and build info on startup - adds logit bias to request parameters - removes embedding mode - updates documentation; adds streaming Node.js and Bash examples - fixes code formatting - sets server threads to 1 since the current global state doesn't work well with simultaneous requests - adds truncation of the input prompt and better context reset - removes token limit from the input prompt - significantly simplified the logic and removed a lot of variables --------- Co-authored-by: anon998 <131767832+anon998@users.noreply.github.com> Co-authored-by: Henri Vasserman <henv@hot.ee> Co-authored-by: Felix Hellmann <privat@cirk2.de> Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Lesaun Harvey <Lesaun@gmail.com> 2023-06-17 11:53:04 +00:00			`answer += message.content`
			`process.stdout.write(message.content)`
			`if (message.stop) {`
			`if (message.truncated) {`
			`chat.shift()`
			`}`
			`break`
			`}`
			`}`
			`}`

			`process.stdout.write('\n')`
			`chat.push({ human: question, assistant: answer.trimStart() })`
			`}`

			`const rl = readline.createInterface({ input: stdin, output: stdout });`

			`const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {`
			`rl.question(query, options, resolve)`
			`});`

			`while(true) {`
			`const question = await readlineQuestion(rl, '> ')`
			`await chat_completion(question)`
			`}`