Compare commits

..

5 Commits

Author | SHA1 | Message | Date

piDack | e76326e3e0 | Merge 24bad77ebf into 32d6ee6385 | 2024-12-24 10:55:02 +08:00

Diego Devesa | 32d6ee6385 | ggml : fix const usage in SSE path (#10962) | 2024-12-23 20:25:52 +01:00

Xuan Son Nguyen | 14b699ecde | server : fix missing model id in /model endpoint (#10957) | 2024-12-23 12:52:25 +01:00
    * server : fix missing model id in /model endpoint
    * fix ci

Xuan Son Nguyen | 485dc01214 | server : add system_fingerprint to chat/completion (#10917) | 2024-12-23 12:02:44 +01:00
    * server : add system_fingerprint to chat/completion
    * update README

Radoslav Gerganov | 86bf31cfe6 | rpc-server : add support for the SYCL backend (#10934) | 2024-12-23 10:39:30 +02:00
6 changed files with 40 additions and 18 deletions

View File

@@ -12,6 +12,10 @@
 #include "ggml-vulkan.h"
 #endif
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
 #include "ggml-rpc.h"
 #ifdef _WIN32
 # include <windows.h>
@@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
@@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #elif GGML_USE_VULKAN
     ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_SYCL
+    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;
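
For context, the backend choice in create_backend() is made entirely at compile time: whichever GGML_USE_* macro the build defines wins, and the CPU backend is the fallback when no GPU backend is available or its init fails. Below is a minimal standalone sketch of that pattern, keeping only the SYCL branch added here; pick_backend() is an illustrative name (not rpc-server code), and the header names assume a recent ggml layout where ggml_backend_cpu_init() is declared in ggml-cpu.h.

```cpp
// Sketch of the compile-time backend selection pattern used in create_backend().
// pick_backend() is illustrative only; it is not part of rpc-server.
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-cpu.h"       // ggml_backend_cpu_init()
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"      // ggml_backend_sycl_init()
#endif

static ggml_backend_t pick_backend(void) {
    ggml_backend_t backend = nullptr;
#ifdef GGML_USE_SYCL
    fprintf(stderr, "%s: using SYCL backend\n", __func__);
    backend = ggml_backend_sycl_init(0); // device 0, as in the diff above
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
    }
#endif
    // no GPU backend compiled in, or its init failed: fall back to CPU
    if (!backend) {
        fprintf(stderr, "%s: using CPU backend\n", __func__);
        backend = ggml_backend_cpu_init();
    }
    return backend;
}

int main(void) {
    ggml_backend_t backend = pick_backend();
    // ... allocate tensors and run graphs on `backend` here ...
    ggml_backend_free(backend);
    return 0;
}
```

The real file chains CUDA and Vulkan through the same #elif ladder before reaching the SYCL branch added by this commit.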

View File

@@ -724,7 +724,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
   },
   "total_slots": 1,
   "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
-  "chat_template": "..."
+  "chat_template": "...",
+  "build_info": "b(build number)-(build commit hash)"
 }
 ```

View File

@@ -595,10 +595,11 @@ struct server_task_result_cmpl_final : server_task_result {
         std::time_t t = std::time(0);
         json res = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"model", oaicompat_model},
-            {"object", "chat.completion"},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -632,11 +633,12 @@ struct server_task_result_cmpl_final : server_task_result {
         };
         json ret = json {
             {"choices", json::array({choice})},
             {"created", t},
             {"id", oaicompat_cmpl_id},
             {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"},
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"},
             {"usage", json {
                 {"completion_tokens", n_decoded},
                 {"prompt_tokens", n_prompt_tokens},
@@ -761,11 +763,12 @@ struct server_task_result_cmpl_partial : server_task_result {
         }
         json ret = json {
             {"choices", choices},
             {"created", t},
             {"id", oaicompat_cmpl_id},
             {"model", oaicompat_model},
-            {"object", "chat.completion.chunk"}
+            {"system_fingerprint", build_info},
+            {"object", "chat.completion.chunk"}
         };
         if (timings.prompt_n >= 0) {
@@ -3476,6 +3479,7 @@ int main(int argc, char ** argv) {
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model },
             { "chat_template", llama_get_chat_template(ctx_server.model) },
+            { "build_info", build_info },
         };
 
         res_ok(res, data);
@@ -3697,7 +3701,7 @@ int main(int argc, char ** argv) {
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias},
+                    {"id", params.model_alias.empty() ? params.model : params.model_alias},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},

View File

@@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     })
     assert res.status_code == 200
     assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
+    assert res.body["system_fingerprint"].startswith("b")
     assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     last_cmpl_id = None
     for data in res:
         choice = data["choices"][0]
+        assert data["system_fingerprint"].startswith("b")
         assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
@@ -92,6 +94,7 @@ def test_chat_completion_with_openai_library():
         seed=42,
         temperature=0.8,
     )
+    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
     assert res.choices[0].finish_reason == "length"
     assert res.choices[0].message.content is not None
     assert match_regex("(Suddenly)+", res.choices[0].message.content)

View File

@@ -56,6 +56,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     }
 }
 
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
 //
 // tokenizer and input processing utils
 //
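
This build_info constant is the single source for both the build_info field in /props and the system_fingerprint field added to the chat completion responses above. A tiny standalone sketch of the resulting "b<build number>-<commit hash>" format follows; the macro values below are placeholders, since the real LLAMA_BUILD_NUMBER and LLAMA_COMMIT are injected by the build system.

```cpp
// Illustration of the "b<build number>-<commit hash>" fingerprint format.
// The macro values here are placeholders for the build-time definitions.
#include <cstdio>
#include <string>

#ifndef LLAMA_BUILD_NUMBER
#define LLAMA_BUILD_NUMBER 1234        // hypothetical build number
#endif
#ifndef LLAMA_COMMIT
#define LLAMA_COMMIT "0123abc"         // hypothetical short commit hash
#endif

const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

int main() {
    printf("%s\n", build_info.c_str());  // prints: b1234-0123abc
    return 0;
}
```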

View File

@@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4
 
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -997,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
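
The change above is a const-correctness fix: __sse_f16x4_load only reads from x, so callers holding a const ggml_fp16_t * (for example, a pointer into read-only tensor data) should not need a cast. A small standalone illustration of the const-qualified signature follows; it converts the four halves with the F16C _mm_cvtph_ps intrinsic purely for brevity, which is an assumption of this sketch (ggml's SSE path converts element by element with GGML_FP16_TO_FP32 instead), and f16_t / sse_f16x4_load are illustrative names.

```cpp
// Const-correct fp16x4 load: compiles cleanly for callers with read-only data.
// Build with F16C support (e.g. -mf16c); the intrinsic is used only for illustration.
#include <cstdint>
#include <cstdio>
#include <immintrin.h>

typedef std::uint16_t f16_t;  // stand-in for ggml_fp16_t (IEEE half stored as uint16_t)

static inline __m128 sse_f16x4_load(const f16_t * x) {      // note the const qualifier
    __m128i bits = _mm_loadl_epi64((const __m128i *) x);    // load 4 halves (64 bits)
    return _mm_cvtph_ps(bits);                              // F16C: half -> float
}

int main(void) {
    const f16_t data[4] = {0x3C00, 0x4000, 0x4200, 0x4400}; // 1.0, 2.0, 3.0, 4.0 in fp16
    float out[4];
    // `data` is const; the old non-const signature would reject this pointer without a cast
    _mm_storeu_ps(out, sse_f16x4_load(data));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}
```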