server : (UI) add tok/s, get rid of completion.js (#10786)

* get rid of completion.js * extract chat bubble to a component * add tok/s info * sync * fix BASE_URL * only extract timings when it's enabled * fix auto scroll
2025-01-12 19:50:17 +00:00 · 2024-12-11 20:52:14 +01:00 · 2024-12-11 20:52:14 +01:00 · 235f6e14bf
commit 235f6e14bf
parent 1a31d0dc00
6 changed files with 307 additions and 376 deletions
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
--- a/examples/server/webui/index.html
+++ b/examples/server/webui/index.html
@ -15,7 +15,7 @@
      <!-- sidebar -->
      <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
        <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
-        <div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
+        <div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
          <div class="flex flex-row items-center justify-between mb-4 mt-4">
            <h2 class="font-bold ml-4">Conversations</h2>

@ -120,51 +120,25 @@
            {{ messages.length === 0 ? 'Send a message to start' : '' }}
          </div>
          <div v-for="msg in messages" class="group">
-            <div :class="{
-              'chat': true,
-              'chat-start': msg.role !== 'user',
-              'chat-end': msg.role === 'user',
-            }">
-              <div :class="{
-                'chat-bubble markdown': true,
-                'chat-bubble-base-300': msg.role !== 'user',
-              }">
-                <!-- textarea for editing message -->
-                <template v-if="editingMsg && editingMsg.id === msg.id">
-                  <textarea
-                    class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
-                    v-model="msg.content"></textarea>
-                  <br/>
-                  <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
-                  <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
-                </template>
-                <!-- render message as markdown -->
-                <vue-markdown v-else :source="msg.content" />
-              </div>
-            </div>
-
-            <!-- actions for each message -->
-            <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
-              <!-- user message -->
-              <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
-                ✍️ Edit
-              </button>
-              <!-- assistant message -->
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
-                🔄 Regenerate
-              </button>
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
-                📋 Copy
-              </button>
-            </div>
+            <message-bubble
+              :config="config"
+              :msg="msg"
+              :key="msg.id"
+              :is-generating="isGenerating"
+              :edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
+              :regenerate-msg="regenerateMsg"></message-bubble>
          </div>

          <!-- pending (ongoing) assistant message -->
-          <div id="pending-msg" class="chat chat-start">
-            <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
-              <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
-              <vue-markdown v-else :source="pendingMsg.content" />
-            </div>
+          <div id="pending-msg" class="group">
+            <message-bubble
+              v-if="pendingMsg"
+              :config="config"
+              :msg="pendingMsg"
+              :key="pendingMsg.id"
+              :is-generating="isGenerating"
+              :edit-user-msg-and-regenerate="() => {}"
+              :regenerate-msg="() => {}"></message-bubble>
          </div>
        </div>

@ -227,6 +201,10 @@
          <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
            <summary class="collapse-title font-bold">Advanced config</summary>
            <div class="collapse-content">
+              <div class="flex flex-row items-center mb-2">
+                <input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
+                <span class="ml-4">Show tokens per second</span>
+              </div>
              <label class="form-control mb-2">
                <!-- Custom parameters input -->
                <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
@ -247,6 +225,66 @@

  </div>

+
+  <!-- Template to be used as message bubble -->
+  <template id="message-bubble">
+    <div :class="{
+      'chat': true,
+      'chat-start': msg.role !== 'user',
+      'chat-end': msg.role === 'user',
+    }">
+      <div :class="{
+        'chat-bubble markdown': true,
+        'chat-bubble-base-300': msg.role !== 'user',
+      }">
+        <!-- textarea for editing message -->
+        <template v-if="editingContent !== null">
+          <textarea
+            class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
+            v-model="editingContent"></textarea>
+          <br/>
+          <button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
+          <button class="btn mt-2" @click="editMsg()">Submit</button>
+        </template>
+        <template v-else>
+          <!-- show loading dots for pending message -->
+          <span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
+          <!-- render message as markdown -->
+          <vue-markdown v-else :source="msg.content"></vue-markdown>
+          <!-- render timings if enabled -->
+          <div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
+            <div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
+            <div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
+              <b>Prompt</b><br/>
+              - Tokens: {{ timings.prompt_n }}<br/>
+              - Time: {{ timings.prompt_ms }} ms<br/>
+              - Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
+              <b>Generation</b><br/>
+              - Tokens: {{ timings.predicted_n }}<br/>
+              - Time: {{ timings.predicted_ms }} ms<br/>
+              - Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
+            </div>
+          </div>
+        </template>
+      </div>
+    </div>
+    <!-- actions for each message -->
+    <div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
+      <!-- user message -->
+      <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
+        ✍️ Edit
+      </button>
+      <!-- assistant message -->
+      <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+        🔄 Regenerate
+      </button>
+      <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
+        📋 Copy
+      </button>
+    </div>
+  </template>
+
+
  <!-- Template to be used by settings modal -->
  <template id="settings-modal-short-input">
    <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
--- a/examples/server/webui/package-lock.json
+++ b/examples/server/webui/package-lock.json
@ -13,6 +13,7 @@
        "markdown-it": "^14.1.0",
        "postcss": "^8.4.49",
        "tailwindcss": "^3.4.15",
+        "textlinestream": "^1.1.1",
        "vite-plugin-singlefile": "^2.0.3",
        "vue": "^3.5.13"
      },
@ -2677,6 +2678,12 @@
        "node": ">=14.0.0"
      }
    },
+    "node_modules/textlinestream": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",
+      "integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
+      "license": "MIT"
+    },
    "node_modules/uc.micro": {
      "version": "2.1.0",
      "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",
--- a/examples/server/webui/package.json
+++ b/examples/server/webui/package.json
@ -17,6 +17,7 @@
    "markdown-it": "^14.1.0",
    "postcss": "^8.4.49",
    "tailwindcss": "^3.4.15",
+    "textlinestream": "^1.1.1",
    "vite-plugin-singlefile": "^2.0.3",
    "vue": "^3.5.13"
  }
--- a/examples/server/webui/src/completion.js
+++ b/examples/server/webui/src/completion.js
@ -1,225 +0,0 @@
-const paramDefaults = {
-  stream: true,
-  temperature: 0.2,
-};
-
-let generation_settings = null;
-
-export class CompletionError extends Error {
-  constructor(message, name, data) {
-    super(message);
-    this.name = name;
-  }
-};
-
-// Completes the prompt as a generator. Recommended for most use cases.
-//
-// Example:
-//
-//    import { llama } from '/completion.js'
-//
-//    const request = llama("Tell me a joke", {n_predict: 800})
-//    for await (const chunk of request) {
-//      document.write(chunk.data.content)
-//    }
-//
-export async function* llama(prompt, params = {}, config = {}) {
-  let controller = config.controller;
-  const api_url = config.api_url?.replace(/\/+$/, '') || "";
-
-  if (!controller) {
-    controller = new AbortController();
-  }
-
-  const completionParams = { ...paramDefaults, ...params, prompt };
-
-  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
-    method: 'POST',
-    body: JSON.stringify(completionParams),
-    headers: {
-      'Connection': 'keep-alive',
-      'Content-Type': 'application/json',
-      'Accept': 'text/event-stream',
-      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
-    },
-    signal: controller.signal,
-  });
-
-  const status = response.status;
-  if (status !== 200) {
-    try {
-      const body = await response.json();
-      if (body && body.error && body.error.message) {
-        throw new CompletionError(body.error.message, 'ServerError');
-      }
-    } catch (err) {
-      throw new CompletionError(err.message, 'ServerError');
-    }
-  }
-
-  const reader = response.body.getReader();
-  const decoder = new TextDecoder();
-
-  let content = "";
-  let leftover = ""; // Buffer for partially read lines
-
-  try {
-    let cont = true;
-
-    while (cont) {
-      const result = await reader.read();
-      if (result.done) {
-        break;
-      }
-
-      // Add any leftover data to the current chunk of data
-      const text = leftover + decoder.decode(result.value);
-
-      // Check if the last character is a line break
-      const endsWithLineBreak = text.endsWith('\n');
-
-      // Split the text into lines
-      let lines = text.split('\n');
-
-      // If the text doesn't end with a line break, then the last line is incomplete
-      // Store it in leftover to be added to the next chunk of data
-      if (!endsWithLineBreak) {
-        leftover = lines.pop();
-      } else {
-        leftover = ""; // Reset leftover if we have a line break at the end
-      }
-
-      // Parse all sse events and add them to result
-      const regex = /^(\S+):\s(.*)$/gm;
-      for (const line of lines) {
-        const match = regex.exec(line);
-        if (match) {
-          result[match[1]] = match[2];
-          if (result.data === '[DONE]') {
-            cont = false;
-            break;
-          }
-
-          // since we know this is llama.cpp, let's just decode the json in data
-          if (result.data) {
-            result.data = JSON.parse(result.data);
-            content += result.data.content;
-
-            // yield
-            yield result;
-
-            // if we got a stop token from server, we will break here
-            if (result.data.stop) {
-              if (result.data.generation_settings) {
-                generation_settings = result.data.generation_settings;
-              }
-              cont = false;
-              break;
-            }
-          }
-          if (result.error) {
-            try {
-              result.error = JSON.parse(result.error);
-              if (result.error.message.includes('slot unavailable')) {
-                // Throw an error to be caught by upstream callers
-                throw new Error('slot unavailable');
-              } else {
-                console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
-              }
-            } catch(e) {
-              console.error(`llama.cpp error ${result.error}`)
-            }
-          }
-        }
-      }
-    }
-  } catch (e) {
-    if (e.name !== 'AbortError') {
-      console.error("llama error: ", e);
-    }
-    throw e;
-  }
-  finally {
-    controller.abort();
-  }
-
-  return content;
-}
-
-// Call llama, return an event target that you can subscribe to
-//
-// Example:
-//
-//    import { llamaEventTarget } from '/completion.js'
-//
-//    const conn = llamaEventTarget(prompt)
-//    conn.addEventListener("message", (chunk) => {
-//      document.write(chunk.detail.content)
-//    })
-//
-export const llamaEventTarget = (prompt, params = {}, config = {}) => {
-  const eventTarget = new EventTarget();
-  (async () => {
-    let content = "";
-    for await (const chunk of llama(prompt, params, config)) {
-      if (chunk.data) {
-        content += chunk.data.content;
-        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
-      }
-      if (chunk.data.generation_settings) {
-        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
-      }
-      if (chunk.data.timings) {
-        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
-      }
-    }
-    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
-  })();
-  return eventTarget;
-}
-
-// Call llama, return a promise that resolves to the completed text. This does not support streaming
-//
-// Example:
-//
-//     llamaPromise(prompt).then((content) => {
-//       document.write(content)
-//     })
-//
-//     or
-//
-//     const content = await llamaPromise(prompt)
-//     document.write(content)
-//
-export const llamaPromise = (prompt, params = {}, config = {}) => {
-  return new Promise(async (resolve, reject) => {
-    let content = "";
-    try {
-      for await (const chunk of llama(prompt, params, config)) {
-        content += chunk.data.content;
-      }
-      resolve(content);
-    } catch (error) {
-      reject(error);
-    }
-  });
-};
-
-/**
- * (deprecated)
- */
-export const llamaComplete = async (params, controller, callback) => {
-  for await (const chunk of llama(params.prompt, params, { controller })) {
-    callback(chunk);
-  }
-}
-
-// Get the model info from the server. This is useful for getting the context window and so on.
-export const llamaModelInfo = async (config = {}) => {
-  if (!generation_settings) {
-    const api_url = config.api_url?.replace(/\/+$/, '') || "";
-    const props = await fetch(`${api_url}/props`).then(r => r.json());
-    generation_settings = props.default_generation_settings;
-  }
-  return generation_settings;
-}
--- a/examples/server/webui/src/main.js
+++ b/examples/server/webui/src/main.js
@ -1,21 +1,25 @@
 import './styles.css';
 import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
-import { llama } from './completion.js';
 import MarkdownIt from 'markdown-it';
+import TextLineStream from 'textlinestream';
+
+const isDev = import.meta.env.MODE === 'development';

 // utility functions
 const isString = (x) => !!x.toLowerCase;
-const isNumeric = (n) => !isString(n) && !isNaN(n);
+// true only for the primitive values `true` and `false`
+const isBoolean = (x) => x === true || x === false;
+// true for values that isString rejects and that are not NaN after numeric coercion;
+// booleans are excluded explicitly because isNaN(true) and isNaN(false) are both false
+const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
 const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
 const copyStr = (str) => navigator.clipboard.writeText(str);

 // constants
 const BASE_URL = localStorage.getItem('base') // for debugging
-  || (new URL('.', document.baseURI).href).toString(); // for production
+  || (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
 const CONFIG_DEFAULT = {
  // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
  apiKey: '',
  systemMessage: 'You are a helpful assistant.',
+  showTokensPerSecond: false,
  // make sure these default values are in sync with `common.h`
  samplers: 'dkypmxt',
  temperature: 0.8,
@ -101,6 +105,48 @@ const SettingsModalShortInput = defineComponent({
  },
 });

+// Message bubble component: renders a single chat message, the inline editor
+// for it and, when the message carries timings, its speed statistics.
+// The markup is taken from the element with id "message-bubble" in the page.
+const MessageBubble = defineComponent({
+  components: {
+    VueMarkdown
+  },
+  template: document.getElementById('message-bubble').innerHTML,
+  props: {
+    // app configuration; the template reads config.showTokensPerSecond
+    config: Object,
+    // message to render; role, content and (optionally) timings are read from it
+    msg: Object,
+    // true while a response is being generated; the template disables the action buttons
+    isGenerating: Boolean,
+    // callback invoked by editMsg() with a copy of msg carrying the edited content
+    editUserMsgAndRegenerate: Function,
+    // callback invoked by the template with msg when "Regenerate" is clicked
+    regenerateMsg: Function,
+  },
+  data() {
+    return {
+      // null when not editing; otherwise the draft text bound to the textarea
+      editingContent: null,
+    };
+  },
+  computed: {
+    // msg.timings extended with tokens-per-second rates, or null when the message has no timings
+    timings() {
+      if (!this.msg.timings) return null;
+      const { prompt_n, prompt_ms, predicted_n, predicted_ms } = this.msg.timings;
+      // a zero or missing duration would yield NaN/Infinity, which the template
+      // would print verbatim through toFixed(); report a rate of 0 instead
+      const perSecond = (count, ms) => (ms > 0 ? count / (ms / 1000) : 0);
+      return {
+        ...this.msg.timings,
+        prompt_per_second: perSecond(prompt_n, prompt_ms),
+        predicted_per_second: perSecond(predicted_n, predicted_ms),
+      };
+    }
+  },
+  methods: {
+    // copy the message content to the clipboard
+    copyMsg() {
+      copyStr(this.msg.content);
+    },
+    // submit the draft: hand a copy of the message with the new content to the parent, then leave edit mode
+    editMsg() {
+      this.editUserMsgAndRegenerate({
+        ...this.msg,
+        content: this.editingContent,
+      });
+      this.editingContent = null;
+    },
+  },
+});
+
 // coversations is stored in localStorage
 // format: { [convId]: { id: string, lastModified: number, messages: [...] } }
 // convId is a string prefixed with 'conv-'
@ -192,10 +238,29 @@ const chatScrollToBottom = (requiresNearBottom) => {
  }
 };

+/**
+ * Wrapper for SSE: sends a request with fetch() and yields the parsed JSON
+ * payload of every `data:` line of the streamed response.
+ *
+ * @param {string} url - endpoint to call
+ * @param {object} fetchOptions - options passed unchanged to fetch()
+ * @yields {object} the parsed JSON payload of each `data:` line, except the `[DONE]` marker
+ * @throws {Error} when the HTTP status is not successful, or when the stream contains an `error:` line
+ */
+async function* sendSSEPostRequest(url, fetchOptions) {
+  const res = await fetch(url, fetchOptions);
+  if (!res.ok) {
+    // a failed request carries no `data:` lines, so without this check the
+    // generator would simply finish and the failure would go unnoticed
+    let message = `Request failed with status ${res.status}`;
+    try {
+      const body = await res.json();
+      message = body?.error?.message || message;
+    } catch (_) {
+      // body is not JSON: keep the status-based message
+    }
+    throw new Error(message);
+  }
+  const lines = res.body
+    .pipeThrough(new TextDecoderStream())
+    .pipeThrough(new TextLineStream());
+  for await (const line of lines) {
+    if (isDev) console.log({line});
+    if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
+      // slice(5) drops the "data:" prefix; JSON.parse tolerates the leading space
+      const data = JSON.parse(line.slice(5));
+      yield data;
+    } else if (line.startsWith('error:')) {
+      const data = JSON.parse(line.slice(6));
+      throw new Error(data.message || 'Unknown error');
+    }
+  }
+}
+
 const mainApp = createApp({
  components: {
    VueMarkdown,
    SettingsModalShortInput,
+    MessageBubble,
  },
  data() {
    return {
@ -209,7 +274,6 @@ const mainApp = createApp({
      selectedTheme: StorageUtils.getTheme(),
      config: StorageUtils.getConfig(),
      showConfigDialog: false,
-      editingMsg: null,
      // const
      themes: THEMES,
      configDefault: {...CONFIG_DEFAULT},
@ -226,6 +290,15 @@ const mainApp = createApp({
    });
    resizeObserver.observe(pendingMsgElem);
  },
+  watch: {
+    // when another conversation is selected: reload the message list,
+    // scroll the chat and uncheck the drawer toggle via hideSidebar()
+    viewingConvId(val, oldVal) {
+      if (val !== oldVal) {
+        this.fetchMessages();
+        chatScrollToBottom();
+        this.hideSidebar();
+      }
+    }
+  },
  methods: {
    hideSidebar() {
      document.getElementById('toggle-drawer').checked = false;
@ -237,18 +310,10 @@ const mainApp = createApp({
    newConversation() {
      if (this.isGenerating) return;
      this.viewingConvId = StorageUtils.getNewConvId();
-      this.editingMsg = null;
-      this.fetchMessages();
-      chatScrollToBottom();
-      this.hideSidebar();
    },
    setViewingConv(convId) {
      if (this.isGenerating) return;
      this.viewingConvId = convId;
-      this.editingMsg = null;
-      this.fetchMessages();
-      chatScrollToBottom();
-      this.hideSidebar();
    },
    deleteConv(convId) {
      if (this.isGenerating) return;
@ -256,7 +321,6 @@ const mainApp = createApp({
        StorageUtils.remove(convId);
        if (this.viewingConvId === convId) {
          this.viewingConvId = StorageUtils.getNewConvId();
-          this.editingMsg = null;
        }
        this.fetchConversation();
        this.fetchMessages();
@ -291,7 +355,6 @@ const mainApp = createApp({
      this.fetchConversation();
      this.fetchMessages();
      this.inputMsg = '';
-      this.editingMsg = null;
      this.generateMessage(currConvId);
      chatScrollToBottom();
    },
@ -299,7 +362,6 @@ const mainApp = createApp({
      if (this.isGenerating) return;
      this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
      this.isGenerating = true;
-      this.editingMsg = null;

      try {
        const abortController = new AbortController();
@ -330,17 +392,21 @@ const mainApp = createApp({
          dry_allowed_length: this.config.dry_allowed_length,
          dry_penalty_last_n: this.config.dry_penalty_last_n,
          max_tokens: this.config.max_tokens,
+          timings_per_token: !!this.config.showTokensPerSecond,
          ...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
-          ...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
        };
-        const config = {
-          controller: abortController,
-          api_url: BASE_URL,
-          endpoint: '/chat/completions',
-        };
-        for await (const chunk of llama(prompt, params, config)) {
-          const stop = chunk.data.stop;
-          const addedContent = chunk.data.choices[0].delta.content;
+        // only add the Authorization header when an API key is configured: a header
+        // whose value is `undefined` is sent by fetch as the literal string "undefined"
+        const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
+          method: 'POST',
+          headers: {
+            'Content-Type': 'application/json',
+            ...(this.config.apiKey ? { 'Authorization': `Bearer ${this.config.apiKey}` } : {}),
+          },
+          body: JSON.stringify(params),
+          signal: abortController.signal,
+        });
+        for await (const chunk of chunks) {
+          const stop = chunk.stop;
+          const addedContent = chunk.choices[0].delta.content;
          const lastContent = this.pendingMsg.content || '';
          if (addedContent) {
            this.pendingMsg = {
@ -349,6 +415,16 @@ const mainApp = createApp({
              content: lastContent + addedContent,
            };
          }
+          const timings = chunk.timings;
+          if (timings && this.config.showTokensPerSecond) {
+            // only extract what's really needed, to save some space
+            this.pendingMsg.timings = {
+              prompt_n: timings.prompt_n,
+              prompt_ms: timings.prompt_ms,
+              predicted_n: timings.predicted_n,
+              predicted_ms: timings.predicted_ms,
+            };
+          }
        }

        StorageUtils.appendMsg(currConvId, this.pendingMsg);
@ -387,14 +463,10 @@ const mainApp = createApp({
      this.fetchMessages();
      this.generateMessage(currConvId);
    },
-    copyMsg(msg) {
-      copyStr(msg.content);
-    },
    editUserMsgAndRegenerate(msg) {
      if (this.isGenerating) return;
      const currConvId = this.viewingConvId;
      const newContent = msg.content;
-      this.editingMsg = null;
      StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
      StorageUtils.appendMsg(currConvId, {
        id: Date.now(),