Compare commits

..

5 Commits

Author | SHA1 | Message | Date

piDack | e76326e3e0 | Merge 24bad77ebf into 32d6ee6385 | 2024-12-24 10:55:02 +08:00

Diego Devesa | 32d6ee6385 | ggml : fix const usage in SSE path (#10962) | 2024-12-23 20:25:52 +01:00

Xuan Son Nguyen | 14b699ecde | server : fix missing model id in /model endpoint (#10957) | 2024-12-23 12:52:25 +01:00
    * server : fix missing model id in /model endpoint
    * fix ci

Xuan Son Nguyen | 485dc01214 | server : add system_fingerprint to chat/completion (#10917) | 2024-12-23 12:02:44 +01:00
    * server : add system_fingerprint to chat/completion
    * update README

Radoslav Gerganov | 86bf31cfe6 | rpc-server : add support for the SYCL backend (#10934) | 2024-12-23 10:39:30 +02:00
6 changed files with 40 additions and 18 deletions

View File

@@ -12,6 +12,10 @@
 #include "ggml-vulkan.h"
 #endif
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
 #include "ggml-rpc.h"
 #ifdef _WIN32
 # include <windows.h>
@@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
@@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #elif GGML_USE_VULKAN
     ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_SYCL
+    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;
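
For context, the backend choice in create_backend() is made entirely at compile time: whichever GGML_USE_* macro the build defines wins, and the CPU backend is the fallback when no GPU backend is available or its init fails. Below is a minimal standalone sketch of that pattern, keeping only the SYCL branch added here; pick_backend() is an illustrative name (not rpc-server code), and the header names assume a recent ggml layout where ggml_backend_cpu_init() is declared in ggml-cpu.h.

```cpp
// Sketch of the compile-time backend selection pattern used in create_backend().
// pick_backend() is illustrative only; it is not part of rpc-server.
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-cpu.h"       // ggml_backend_cpu_init()
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"      // ggml_backend_sycl_init()
#endif

static ggml_backend_t pick_backend(void) {
    ggml_backend_t backend = nullptr;
#ifdef GGML_USE_SYCL
    fprintf(stderr, "%s: using SYCL backend\n", __func__);
    backend = ggml_backend_sycl_init(0); // device 0, as in the diff above
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
    }
#endif
    // no GPU backend compiled in, or its init failed: fall back to CPU
    if (!backend) {
        fprintf(stderr, "%s: using CPU backend\n", __func__);
        backend = ggml_backend_cpu_init();
    }
    return backend;
}

int main(void) {
    ggml_backend_t backend = pick_backend();
    // ... allocate tensors and run graphs on `backend` here ...
    ggml_backend_free(backend);
    return 0;
}
```

The real file chains CUDA and Vulkan through the same #elif ladder before reaching the SYCL branch added by this commit.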

View File

@@ -724,7 +724,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
   },
   "total_slots": 1,
   "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-  "chat_template": "..."
+  "chat_template": "...",
+  "build_info": "b(build number)-(build commit hash)"
 }
 ```

View File

@@ -595,10 +595,11 @@ struct server_task_result_cmpl_final : server_task_result {
         std::time_t t = std::time(0);
         json res = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"model", oaicompat_model},
-            {"object", "chat.completion"},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -632,11 +633,12 @@ struct server_task_result_cmpl_final : server_task_result {
         };
         json ret = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"id", oaicompat_cmpl_id},
             {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -761,11 +763,12 @@ struct server_task_result_cmpl_partial : server_task_result {
         }
         json ret = json {
             {"choices", choices},
             {"created", t},
             {"id", oaicompat_cmpl_id},
             {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"}
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"}
         };
         if (timings.prompt_n >= 0) {
@@ -3476,6 +3479,7 @@ int main(int argc, char ** argv) {
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model },
             { "chat_template", llama_get_chat_template(ctx_server.model) },
+            { "build_info", build_info },
         };
 
         res_ok(res, data);
@@ -3697,7 +3701,7 @@ int main(int argc, char ** argv) {
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},

View File

@@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     })
     assert res.status_code == 200
     assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
+    assert res.body["system_fingerprint"].startswith("b")
     assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     last_cmpl_id = None
     for data in res:
         choice = data["choices"][0]
+        assert data["system_fingerprint"].startswith("b")
         assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
@@ -92,6 +94,7 @@ def test_chat_completion_with_openai_library():
         seed=42,
         temperature=0.8,
     )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
     assert res.choices[0].finish_reason == "length"
     assert res.choices[0].message.content is not None
     assert match_regex("(Suddenly)+", res.choices[0].message.content)

View File

@@ -56,6 +56,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     }
 }
 
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
 //
 // tokenizer and input processing utils
 //
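
This build_info constant is the single source for both the build_info field in /props and the system_fingerprint field added to the chat completion responses above. A tiny standalone sketch of the resulting "b<build number>-<commit hash>" format follows; the macro values below are placeholders, since the real LLAMA_BUILD_NUMBER and LLAMA_COMMIT are injected by the build system.

```cpp
// Illustration of the "b<build number>-<commit hash>" fingerprint format.
// The macro values here are placeholders for the build-time definitions.
#include <cstdio>
#include <string>

#ifndef LLAMA_BUILD_NUMBER
#define LLAMA_BUILD_NUMBER 1234        // hypothetical build number
#endif
#ifndef LLAMA_COMMIT
#define LLAMA_COMMIT "0123abc"         // hypothetical short commit hash
#endif

const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

int main() {
    printf("%s\n", build_info.c_str());  // prints: b1234-0123abc
    return 0;
}
```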

View File

@@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4
 
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -997,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
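
The change above is a const-correctness fix: __sse_f16x4_load only reads from x, so callers holding a const ggml_fp16_t * (for example, a pointer into read-only tensor data) should not need a cast. A small standalone illustration of the const-qualified signature follows; it converts the four halves with the F16C _mm_cvtph_ps intrinsic purely for brevity, which is an assumption of this sketch (ggml's SSE path converts element by element with GGML_FP16_TO_FP32 instead), and f16_t / sse_f16x4_load are illustrative names.

```cpp
// Const-correct fp16x4 load: compiles cleanly for callers with read-only data.
// Build with F16C support (e.g. -mf16c); the intrinsic is used only for illustration.
#include <cstdint>
#include <cstdio>
#include <immintrin.h>

typedef std::uint16_t f16_t;  // stand-in for ggml_fp16_t (IEEE half stored as uint16_t)

static inline __m128 sse_f16x4_load(const f16_t * x) {      // note the const qualifier
    __m128i bits = _mm_loadl_epi64((const __m128i *) x);    // load 4 halves (64 bits)
    return _mm_cvtph_ps(bits);                              // F16C: half -> float
}

int main(void) {
    const f16_t data[4] = {0x3C00, 0x4000, 0x4200, 0x4400}; // 1.0, 2.0, 3.0, 4.0 in fp16
    float out[4];
    // `data` is const; the old non-const signature would reject this pointer without a cast
    _mm_storeu_ps(out, sse_f16x4_load(data));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}
```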