// llama.cpp/examples/server/public/completion.js

// Default completion parameters. `llama()` spreads these first when building the
// request body, so any key supplied in its `params` argument overrides the value here.
const paramDefaults = {
  stream: true,
  n_predict: 500,
  temperature: 0.2,
  stop: ["</s>"]
};

// Cached generation settings. Written by `llama()` when a final (`stop`) chunk carries
// `generation_settings`, and by `llamaModelInfo()` from `/model.json` while still unset.
let generation_settings = null;


// Completes the prompt as a generator. Recommended for most use cases.
//
// Parameters:
//   prompt - text sent to the server as the `prompt` field of the request body.
//   params - completion parameters, merged over `paramDefaults`. When `params.api_key`
//            is set it is also sent as an `Authorization: Bearer ...` header.
//   config - optional `{ controller }`; an AbortController whose signal is attached to
//            the request. A fresh one is created when omitted.
//
// Yields the object returned by `reader.read()` with the parsed SSE fields added to it;
// `chunk.data` is the JSON-decoded payload of a `data:` line. The generator's return
// value is the concatenation of every `data.content` seen.
//
// Throws `Error('slot unavailable')` when an `error:` line reports it. Other server
// errors are logged with `console.error` and the stream keeps going. Any exception is
// rethrown to the caller, and the controller is always aborted on exit.
//
// Example:
//
//    import { llama } from '/completion.js'
//
//    const request = llama("Tell me a joke", {n_predict: 800})
//    for await (const chunk of request) {
//      document.write(chunk.data.content)
//    }
//
export async function* llama(prompt, params = {}, config = {}) {
  let controller = config.controller;

  if (!controller) {
    controller = new AbortController();
  }

  const completionParams = { ...paramDefaults, ...params, prompt };

  const response = await fetch("/completion", {
    method: 'POST',
    body: JSON.stringify(completionParams),
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
      'Accept': 'text/event-stream',
      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });

  const reader = response.body.getReader();
  const decoder = new TextDecoder();

  // Matches one `field: value` SSE line. Deliberately NOT global: a `g` regex keeps
  // `lastIndex` between `exec` calls, which made a field line that directly follows
  // another field line fail to match.
  const sseField = /^(\S+):\s(.*)$/m;

  let content = "";
  let leftover = ""; // Buffer for partially read lines

  try {
    let cont = true;

    while (cont) {
      const result = await reader.read();
      if (result.done) {
        break;
      }

      // `stream: true` lets the decoder hold back an incomplete multi-byte UTF-8
      // sequence until the next network chunk instead of emitting U+FFFD.
      const text = leftover + decoder.decode(result.value, { stream: true });

      // The element after the last '\n' is either "" (text ended on a line break)
      // or an incomplete line; in both cases it is carried over to the next read.
      const lines = text.split('\n');
      leftover = lines.pop();

      // Parse all sse events and add them to result
      for (const line of lines) {
        const match = sseField.exec(line);
        if (!match) {
          continue;
        }

        const field = match[1];
        result[field] = match[2];

        // Only decode the field that was just received. An earlier line of the same
        // chunk may have left an already-parsed object on `result`.
        if (field === 'data' && result.data) {
          // since we know this is llama.cpp, let's just decode the json in data
          result.data = JSON.parse(result.data);
          content += result.data.content;

          yield result;

          // if we got a stop token from server, we will break here
          if (result.data.stop) {
            if (result.data.generation_settings) {
              generation_settings = result.data.generation_settings;
            }
            cont = false;
            break;
          }
        } else if (field === 'error' && result.error) {
          result.error = JSON.parse(result.error);
          if (result.error.content.includes('slot unavailable')) {
            // Throw an error to be caught by upstream callers
            throw new Error('slot unavailable');
          }
          console.error(`llama.cpp error: ${result.error.content}`);
        }
      }
    }
  } catch (e) {
    if (e.name !== 'AbortError') {
      console.error("llama error: ", e);
    }
    throw e;
  }
  finally {
    // Closes the connection whether the stream finished, failed, or the consumer
    // stopped iterating early.
    controller.abort();
  }

  return content;
}

// Call llama, return an event target that you can subscribe to
//
// Events dispatched (all are CustomEvent; the payload is in `event.detail`):
//   "message"             - `chunk.data` of every chunk yielded by `llama`
//   "generation_settings" - `chunk.data.generation_settings`, when the chunk has it
//   "timings"             - `chunk.data.timings`, when the chunk has it
//   "done"                - `{ content }`, the concatenated content, after the stream ends
//   "error"               - the exception thrown by `llama`; "done" is not dispatched then
//
// Example:
//
//    import { llamaEventTarget } from '/completion.js'
//
//    const conn = llamaEventTarget(prompt)
//    conn.addEventListener("message", (chunk) => {
//      document.write(chunk.detail.content)
//    })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
  const eventTarget = new EventTarget();
  (async () => {
    let content = "";
    try {
      for await (const chunk of llama(prompt, params, config)) {
        // Every per-chunk event depends on `chunk.data`, so guard them all at once.
        if (!chunk.data) {
          continue;
        }
        content += chunk.data.content;
        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
        if (chunk.data.generation_settings) {
          eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
        }
        if (chunk.data.timings) {
          eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
        }
      }
    } catch (error) {
      // Nobody holds this promise, so a rejection here would be unhandled and
      // invisible to subscribers. Report the failure through the event target instead.
      eventTarget.dispatchEvent(new CustomEvent("error", { detail: error }));
      return;
    }
    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
  })();
  return eventTarget;
}

// Call llama and collect the whole completion. The returned promise resolves to the
// concatenated `content` of every chunk, and rejects with whatever `llama` throws.
// This does not support streaming.
//
// Example:
//
//     llamaPromise(prompt).then((content) => {
//       document.write(content)
//     })
//
//     or
//
//     const content = await llamaPromise(prompt)
//     document.write(content)
//
export const llamaPromise = async (prompt, params = {}, config = {}) => {
  let text = "";
  const stream = llama(prompt, params, config);
  for await (const piece of stream) {
    text += piece.data.content;
  }
  return text;
};

/**
 * (deprecated)
 *
 * Callback-style wrapper around `llama`: runs the completion for `params.prompt`,
 * passing `params` through and `controller` as the abort controller, and invokes
 * `callback` once with every chunk the generator yields.
 */
export const llamaComplete = async (params, controller, callback) => {
  const { prompt } = params;
  const stream = llama(prompt, params, { controller });
  for await (const piece of stream) {
    callback(piece);
  }
};

// Get the model info from the server. This is useful for getting the context window and so on.
// The result is cached in `generation_settings`; `/model.json` is requested only while
// that cache is still empty.
export const llamaModelInfo = async () => {
  if (generation_settings) {
    return generation_settings;
  }
  const response = await fetch("/model.json");
  generation_settings = await response.json();
  return generation_settings;
}
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00			`const paramDefaults = {`
			`stream: true,`
			`n_predict: 500,`
			`temperature: 0.2,`
			`stop: ["</s>"]`
			`};`

Expose generation timings from server & update completions.js (#2116) * use javascript generators as much cleaner API Also add ways to access completion as promise and EventSource * export llama_timings as struct and expose them in server * update readme, update baked includes * llama : uniform variable names + struct init --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-05 20:51:13 +00:00			`let generation_settings = null;`


			`// Completes the prompt as a generator. Recommended for most use cases.`
			`//`
			`// Example:`
			`//`
			`// import { llama } from '/completion.js'`
			`//`
			`// const request = llama("Tell me a joke", {n_predict: 800})`
			`// for await (const chunk of request) {`
			`// document.write(chunk.data.content)`
			`// }`
			`//`
			`export async function* llama(prompt, params = {}, config = {}) {`
			`let controller = config.controller;`

Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00			`if (!controller) {`
			`controller = new AbortController();`
			`}`

Expose generation timings from server & update completions.js (#2116) * use javascript generators as much cleaner API Also add ways to access completion as promise and EventSource * export llama_timings as struct and expose them in server * update readme, update baked includes * llama : uniform variable names + struct init --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-05 20:51:13 +00:00			`const completionParams = { ...paramDefaults, ...params, prompt };`

Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00			`const response = await fetch("/completion", {`
			`method: 'POST',`
			`body: JSON.stringify(completionParams),`
			`headers: {`
			`'Connection': 'keep-alive',`
			`'Content-Type': 'application/json',`
server : add optional API Key Authentication example (#4441) * Add API key authentication for enhanced server-client security * server : to snake_case --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-12-15 11:49:01 +00:00			`'Accept': 'text/event-stream',`
			...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00			`},`
			`signal: controller.signal,`
			`});`

			`const reader = response.body.getReader();`
			`const decoder = new TextDecoder();`

			`let content = "";`
Fixing race condition in server and partial stream handling in frontend. (#2391) * Fixing race condition in server.cpp and partial stream handling in completion.js * Reverting assert edits. * Adding newline to eof 2023-08-04 11:37:24 +00:00			`let leftover = ""; // Buffer for partially read lines`
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00
			`try {`
			`let cont = true;`

			`while (cont) {`
			`const result = await reader.read();`
			`if (result.done) {`
			`break;`
			`}`

Fixing race condition in server and partial stream handling in frontend. (#2391) * Fixing race condition in server.cpp and partial stream handling in completion.js * Reverting assert edits. * Adding newline to eof 2023-08-04 11:37:24 +00:00			`// Add any leftover data to the current chunk of data`
			`const text = leftover + decoder.decode(result.value);`
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00
Fixing race condition in server and partial stream handling in frontend. (#2391) * Fixing race condition in server.cpp and partial stream handling in completion.js * Reverting assert edits. * Adding newline to eof 2023-08-04 11:37:24 +00:00			`// Check if the last character is a line break`
			`const endsWithLineBreak = text.endsWith('\n');`
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00
Fixing race condition in server and partial stream handling in frontend. (#2391) * Fixing race condition in server.cpp and partial stream handling in completion.js * Reverting assert edits. * Adding newline to eof 2023-08-04 11:37:24 +00:00			`// Split the text into lines`
			`let lines = text.split('\n');`
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00
Fixing race condition in server and partial stream handling in frontend. (#2391) * Fixing race condition in server.cpp and partial stream handling in completion.js * Reverting assert edits. * Adding newline to eof 2023-08-04 11:37:24 +00:00			`// If the text doesn't end with a line break, then the last line is incomplete`
			`// Store it in leftover to be added to the next chunk of data`
			`if (!endsWithLineBreak) {`
			`leftover = lines.pop();`
			`} else {`
			`leftover = ""; // Reset leftover if we have a line break at the end`
			`}`
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00
Fixing race condition in server and partial stream handling in frontend. (#2391) * Fixing race condition in server.cpp and partial stream handling in completion.js * Reverting assert edits. * Adding newline to eof 2023-08-04 11:37:24 +00:00			`// Parse all sse events and add them to result`
			`const regex = /^(\S+):\s(.*)$/gm;`
			`for (const line of lines) {`
			`const match = regex.exec(line);`
			`if (match) {`
			`result[match[1]] = match[2]`
			`// since we know this is llama.cpp, let's just decode the json in data`
			`if (result.data) {`
			`result.data = JSON.parse(result.data);`
			`content += result.data.content;`

			`// yield`
			`yield result;`

			`// if we got a stop token from server, we will break here`
			`if (result.data.stop) {`
			`if (result.data.generation_settings) {`
			`generation_settings = result.data.generation_settings;`
			`}`
			`cont = false;`
			`break;`
			`}`
			`}`
server : throw an error when `slot unavailable` (#4741) 2024-01-03 08:43:19 +00:00			`if (result.error) {`
			`result.error = JSON.parse(result.error);`
			`if (result.error.content.includes('slot unavailable')) {`
			`// Throw an error to be caught by upstream callers`
			`throw new Error('slot unavailable');`
			`} else {`
			console.error(`llama.cpp error: ${result.error.content}`);
			`}`
			`}`
server : relay error messages (#4131) 2023-11-19 16:54:10 +00:00			`if (result.error) {`
			`result.error = JSON.parse(result.error);`
			console.error(`llama.cpp error: ${result.error.content}`);
			`}`
Expose generation timings from server & update completions.js (#2116) * use javascript generators as much cleaner API Also add ways to access completion as promise and EventSource * export llama_timings as struct and expose them in server * update readme, update baked includes * llama : uniform variable names + struct init --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-05 20:51:13 +00:00			`}`
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00			`}`
			`}`
			`} catch (e) {`
Expose generation timings from server & update completions.js (#2116) * use javascript generators as much cleaner API Also add ways to access completion as promise and EventSource * export llama_timings as struct and expose them in server * update readme, update baked includes * llama : uniform variable names + struct init --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-05 20:51:13 +00:00			`if (e.name !== 'AbortError') {`
			`console.error("llama error: ", e);`
			`}`
Simple webchat for server (#1998) * expose simple web interface on root domain * embed index and add --path for choosing static dir * allow server to multithread because web browsers send a lot of garbage requests we want the server to multithread when serving 404s for favicon's etc. To avoid blowing up llama we just take a mutex when it's invoked. * let's try this with the xxd tool instead and see if msvc is happier with that * enable server in Makefiles * add /completion.js file to make it easy to use the server from js * slightly nicer css * rework state management into session, expose historyTemplate to settings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-04 14:05:27 +00:00			`throw e;`
			`}`
			`finally {`
			`controller.abort();`
			`}`

			`return content;`
			`}`
Expose generation timings from server & update completions.js (#2116) * use javascript generators as much cleaner API Also add ways to access completion as promise and EventSource * export llama_timings as struct and expose them in server * update readme, update baked includes * llama : uniform variable names + struct init --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-05 20:51:13 +00:00
english : use `typos` to fix comments and logs (#4354) 2023-12-12 09:53:36 +00:00			`// Call llama, return an event target that you can subscribe to`
Expose generation timings from server & update completions.js (#2116) * use javascript generators as much cleaner API Also add ways to access completion as promise and EventSource * export llama_timings as struct and expose them in server * update readme, update baked includes * llama : uniform variable names + struct init --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> 2023-07-05 20:51:13 +00:00			`//`
			`// Example:`
			`//`
			`// import { llamaEventTarget } from '/completion.js'`
			`//`
			`// const conn = llamaEventTarget(prompt)`
			`// conn.addEventListener("message", (chunk) => {`
			`// document.write(chunk.detail.content)`
			`// })`
			`//`
			`export const llamaEventTarget = (prompt, params = {}, config = {}) => {`
			`const eventTarget = new EventTarget();`
			`(async () => {`
			`let content = "";`
			`for await (const chunk of llama(prompt, params, config)) {`
			`if (chunk.data) {`
			`content += chunk.data.content;`
			`eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));`
			`}`
			`if (chunk.data.generation_settings) {`
			`eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));`
			`}`
			`if (chunk.data.timings) {`
			`eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));`
			`}`
			`}`
			`eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));`
			`})();`
			`return eventTarget;`
			`}`

			`// Call llama, return a promise that resolves to the completed text. This does not support streaming`
			`//`
			`// Example:`
			`//`
			`// llamaPromise(prompt).then((content) => {`
			`// document.write(content)`
			`// })`
			`//`
			`// or`
			`//`
			`// const content = await llamaPromise(prompt)`
			`// document.write(content)`
			`//`
			`export const llamaPromise = (prompt, params = {}, config = {}) => {`
			`return new Promise(async (resolve, reject) => {`
			`let content = "";`
			`try {`
			`for await (const chunk of llama(prompt, params, config)) {`
			`content += chunk.data.content;`
			`}`
			`resolve(content);`
			`} catch (error) {`
			`reject(error);`
			`}`
			`});`
			`};`

			`/**`
			`* (deprecated)`
			`*/`
			`export const llamaComplete = async (params, controller, callback) => {`
			`for await (const chunk of llama(params.prompt, params, { controller })) {`
			`callback(chunk);`
			`}`
			`}`

			`// Get the model info from the server. This is useful for getting the context window and so on.`
			`export const llamaModelInfo = async () => {`
			`if (!generation_settings) {`
			`generation_settings = await fetch("/model.json").then(r => r.json());`
			`}`
			`return generation_settings;`
			`}`