llama.cpp/examples/server/chat.mjs
2023-10-16 12:36:05 -04:00

131 lines
3.7 KiB
JavaScript

import * as readline from 'node:readline'
import { stdin, stdout } from 'node:process'
import { readFileSync } from 'node:fs'
import { SchemaConverter } from './public/json-schema-to-grammar.mjs'
const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
(_, index) => args[index - 1] === "--grammar-json-schema"
);
const no_cached_prompt = args.find(
(_, index) => args[index - 1] === "--no-cache-prompt"
) ?? "false";
const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
// Example usage: function,arguments
const grammarJsonSchemaPropOrder = args.find(
(_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
);
const propOrder = grammarJsonSchemaPropOrder
? grammarJsonSchemaPropOrder
.split(",")
.reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
: {};
let grammar = null
if (grammarJsonSchemaFile) {
const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
const converter = new SchemaConverter(propOrder)
converter.visit(schema, '')
grammar = converter.formatGrammar()
}
if (grammarFile) {
grammar = readFileSync(grammarFile, 'utf-8')
}
// for cached prompt
let slot_id = -1;
const API_URL = 'http://127.0.0.1:8080'
const chat = [
{
human: "Hello, Assistant.",
assistant: "Hello. How may I help you today?"
},
{
human: "Please tell me the largest city in Europe.",
assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
},
]
const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`
function format_prompt(question) {
return `${instruction}\n${
chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
}\n### Human: ${question}\n### Assistant:`
}
async function tokenize(content) {
const result = await fetch(`${API_URL}/tokenize`, {
method: 'POST',
body: JSON.stringify({ content })
})
if (!result.ok) {
return []
}
return await result.json().tokens
}
const n_keep = await tokenize(instruction).length
async function chat_completion(question) {
const result = await fetch(`${API_URL}/completion`, {
method: 'POST',
body: JSON.stringify({
prompt: format_prompt(question),
temperature: 0.2,
top_k: 40,
top_p: 0.9,
n_keep: n_keep,
n_predict: 256,
cache_prompt: no_cached_prompt === "false",
slot_id: slot_id,
stop: ["\n### Human:"], // stop completion after generating this
grammar,
stream: true,
})
})
if (!result.ok) {
return
}
let answer = ''
for await (var chunk of result.body) {
const t = Buffer.from(chunk).toString('utf8')
if (t.startsWith('data: ')) {
const message = JSON.parse(t.substring(6))
slot_id = message.slot_id
answer += message.content
process.stdout.write(message.content)
if (message.stop) {
if (message.truncated) {
chat.shift()
}
break
}
}
}
process.stdout.write('\n')
chat.push({ human: question, assistant: answer.trimStart() })
}
const rl = readline.createInterface({ input: stdin, output: stdout });
const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {
rl.question(query, options, resolve)
});
while(true) {
const question = await readlineQuestion(rl, '> ')
await chat_completion(question)
}