Compare commits

..

1 Commits

Author SHA1 Message Date
piDack
e76326e3e0
Merge 24bad77ebf into 32d6ee6385 2024-12-24 10:55:02 +08:00
16 changed files with 338 additions and 511 deletions

View File

@ -34,7 +34,6 @@ endforeach()
add_executable(${TARGET} ${TARGET_SRCS}) add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL) if (LLAMA_SERVER_SSL)

View File

@ -450,8 +450,6 @@ These words will not be included in the completion, so make sure to add them to
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain. `post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error.
**Response format** **Response format**
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. - Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.

View File

@ -92,7 +92,6 @@ struct slot_params {
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt; std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false; bool timings_per_token = false;
bool post_sampling_probs = false; bool post_sampling_probs = false;
bool ignore_eos = false; bool ignore_eos = false;
@ -210,7 +209,6 @@ struct server_task {
params.n_discard = json_value(data, "n_discard", defaults.n_discard); params.n_discard = json_value(data, "n_discard", defaults.n_discard);
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
@ -524,7 +522,6 @@ struct server_task_result_cmpl_final : server_task_result {
bool post_sampling_probs; bool post_sampling_probs;
std::vector<completion_token_output> probs_output; std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
slot_params generation_params; slot_params generation_params;
@ -571,7 +568,7 @@ struct server_task_result_cmpl_final : server_task_result {
if (!stream && !probs_output.empty()) { if (!stream && !probs_output.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
} }
return response_fields.empty() ? res : json_get_nested_values(response_fields, res); return res;
} }
json to_json_oaicompat_chat() { json to_json_oaicompat_chat() {
@ -2069,7 +2066,6 @@ struct server_context {
res->tokens = slot.generated_tokens; res->tokens = slot.generated_tokens;
res->timings = slot.get_timings(); res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
res->response_fields = slot.params.response_fields;
res->truncated = slot.truncated; res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded; res->n_decoded = slot.n_decoded;
@ -3790,17 +3786,6 @@ int main(int argc, char ** argv) {
return; return;
} }
bool use_base64 = false;
if (body.count("encoding_format") != 0) {
const std::string& format = body.at("encoding_format");
if (format == "base64") {
use_base64 = true;
} else if (format != "float") {
res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
return;
}
}
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
for (const auto & tokens : tokenized_prompts) { for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input // this check is necessary for models that do not add BOS token to the input
@ -3852,7 +3837,7 @@ int main(int argc, char ** argv) {
} }
// write JSON response // write JSON response
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : json(responses);
res_ok(res, root); res_ok(res, root);
}; };

View File

@ -95,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int):
res = server.make_request("POST", "/completion", data={ res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is", "prompt": "I believe the meaning of life is",
"seed": 42, "seed": 42,
"temperature": 0.0, "temperature": 1.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
}) })
if last_res is not None: if last_res is not None:
@ -120,10 +120,9 @@ def test_different_result_different_seed(n_slots: int):
assert res.body["content"] != last_res.body["content"] assert res.body["content"] != last_res.body["content"]
last_res = res last_res = res
# TODO figure why it don't work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32]) @pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0]) @pytest.mark.parametrize("temperature", [0.0, 1.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float): def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
global server global server
server.n_batch = n_batch server.n_batch = n_batch
@ -258,40 +257,6 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
# assert match_regex(re_content, res.body["content"]) # assert match_regex(re_content, res.body["content"])
@pytest.mark.parametrize(
"prompt,n_predict,response_fields",
[
("I believe the meaning of life is", 8, []),
("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
],
)
def test_completion_response_fields(
prompt: str, n_predict: int, response_fields: list[str]
):
global server
server.start()
res = server.make_request(
"POST",
"/completion",
data={
"n_predict": n_predict,
"prompt": prompt,
"response_fields": response_fields,
},
)
assert res.status_code == 200
assert "content" in res.body
assert len(res.body["content"])
if len(response_fields):
assert res.body["generation_settings/n_predict"] == n_predict
assert res.body["prompt"] == "<s> " + prompt
assert isinstance(res.body["content"], str)
assert len(res.body) == len(response_fields)
else:
assert len(res.body)
assert "generation_settings" in res.body
def test_n_probs(): def test_n_probs():
global server global server
server.start() server.start()

View File

@ -1,5 +1,3 @@
import base64
import struct
import pytest import pytest
from openai import OpenAI from openai import OpenAI
from utils import * from utils import *
@ -196,42 +194,3 @@ def test_embedding_usage_multiple():
assert res.status_code == 200 assert res.status_code == 200
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
assert res.body['usage']['prompt_tokens'] == 2 * 9 assert res.body['usage']['prompt_tokens'] == 2 * 9
def test_embedding_openai_library_base64():
server.start()
test_input = "Test base64 embedding output"
# get embedding in default format
res = server.make_request("POST", "/v1/embeddings", data={
"input": test_input
})
assert res.status_code == 200
vec0 = res.body["data"][0]["embedding"]
# get embedding in base64 format
res = server.make_request("POST", "/v1/embeddings", data={
"input": test_input,
"encoding_format": "base64"
})
assert res.status_code == 200
assert "data" in res.body
assert len(res.body["data"]) == 1
embedding_data = res.body["data"][0]
assert "embedding" in embedding_data
assert isinstance(embedding_data["embedding"], str)
# Verify embedding is valid base64
decoded = base64.b64decode(embedding_data["embedding"])
# Verify decoded data can be converted back to float array
float_count = len(decoded) // 4 # 4 bytes per float
floats = struct.unpack(f'{float_count}f', decoded)
assert len(floats) > 0
assert all(isinstance(x, float) for x in floats)
assert len(floats) == len(vec0)
# make sure the decoded data is the same as the original
for x, y in zip(floats, vec0):
assert abs(x - y) < EPSILON

View File

@ -3,7 +3,6 @@
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
#include "llama.h" #include "llama.h"
#include "common/base64.hpp"
#ifndef NDEBUG #ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error // crash the server in debug mode, otherwise send an http 500 error
@ -91,28 +90,6 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
return false; return false;
} }
// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
json result = json::object();
for (const std::string & path : paths) {
json current = js;
const auto keys = string_split<std::string>(path, /*separator*/ '/');
bool valid_path = true;
for (const std::string & k : keys) {
if (valid_path && current.is_object() && current.contains(k)) {
current = current[k];
} else {
valid_path = false;
}
}
if (valid_path) {
result[path] = current;
}
}
return result;
}
/** /**
* this handles 2 cases: * this handles 2 cases:
* - only string, example: "string" * - only string, example: "string"
@ -614,31 +591,16 @@ static json oaicompat_completion_params_parse(
return llama_params; return llama_params;
} }
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) { static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array(); json data = json::array();
int32_t n_tokens = 0; int32_t n_tokens = 0;
int i = 0; int i = 0;
for (const auto & elem : embeddings) { for (const auto & elem : embeddings) {
json embedding_obj; data.push_back(json{
if (use_base64) {
const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
const char* data_ptr = reinterpret_cast<const char*>(vec.data());
size_t data_size = vec.size() * sizeof(float);
embedding_obj = {
{"embedding", base64::encode(data_ptr, data_size)},
{"index", i++},
{"object", "embedding"},
{"encoding_format", "base64"}
};
} else {
embedding_obj = {
{"embedding", json_value(elem, "embedding", json::array())}, {"embedding", json_value(elem, "embedding", json::array())},
{"index", i++}, {"index", i++},
{"object", "embedding"} {"object", "embedding"}
}; });
}
data.push_back(embedding_obj);
n_tokens += json_value(elem, "tokens_evaluated", 0); n_tokens += json_value(elem, "tokens_evaluated", 0);
} }

View File

@ -234,7 +234,6 @@ function(ggml_add_backend_library backend)
# write the shared library to the output directory # write the shared library to the output directory
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
else() else()
add_library(${backend} ${ARGN}) add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend}) target_link_libraries(ggml PUBLIC ${backend})

View File

@ -66,26 +66,6 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
static std::wstring utf8_to_utf16(const std::string & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.from_bytes(str);
}
static std::string utf16_to_utf8(const std::wstring & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.to_bytes(str);
}
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#ifdef _WIN32 #ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>; using dl_handle = std::remove_pointer_t<HMODULE>;
@ -108,6 +88,11 @@ static dl_handle * dl_load_library(const std::wstring & path) {
return handle; return handle;
} }
static dl_handle * dl_load_library(const std::string & path) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return dl_load_library(converter.from_bytes(path));
}
static void * dl_get_sym(dl_handle * handle, const char * name) { static void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@ -129,8 +114,8 @@ struct dl_handle_deleter {
} }
}; };
static void * dl_load_library(const std::wstring & path) { static void * dl_load_library(const std::string & path) {
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL); dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
return handle; return handle;
} }
@ -217,11 +202,11 @@ struct ggml_backend_registry {
devices.push_back(device); devices.push_back(device);
} }
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) { ggml_backend_reg_t load_backend(const char * path, bool silent) {
dl_handle_ptr handle { dl_load_library(path) }; dl_handle_ptr handle { dl_load_library(path) };
if (!handle) { if (!handle) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str()); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
} }
return nullptr; return nullptr;
} }
@ -229,7 +214,7 @@ struct ggml_backend_registry {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) { if (score_fn && score_fn() == 0) {
if (!silent) { if (!silent) {
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str()); GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
} }
return nullptr; return nullptr;
} }
@ -237,7 +222,7 @@ struct ggml_backend_registry {
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) { if (!backend_init_fn) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str()); GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
} }
return nullptr; return nullptr;
} }
@ -246,16 +231,16 @@ struct ggml_backend_registry {
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) { if (!silent) {
if (!reg) { if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str()); GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
} else { } else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION); __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
} }
} }
return nullptr; return nullptr;
} }
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str()); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
register_backend(reg, std::move(handle)); register_backend(reg, std::move(handle));
@ -391,14 +376,14 @@ ggml_backend_t ggml_backend_init_best(void) {
// Dynamic loading // Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) { ggml_backend_reg_t ggml_backend_load(const char * path) {
return get_reg().load_backend(utf8_to_utf16(path), false); return get_reg().load_backend(path, false);
} }
void ggml_backend_unload(ggml_backend_reg_t reg) { void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true); get_reg().unload_backend(reg, true);
} }
static std::wstring get_executable_path() { static std::string get_executable_path() {
#if defined(__APPLE__) #if defined(__APPLE__)
// get executable path // get executable path
std::vector<char> path; std::vector<char> path;
@ -416,7 +401,7 @@ static std::wstring get_executable_path() {
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
return utf8_to_utf16(base_path + "/"); return base_path + "/";
#elif defined(__linux__) || defined(__FreeBSD__) #elif defined(__linux__) || defined(__FreeBSD__)
std::string base_path = "."; std::string base_path = ".";
std::vector<char> path(1024); std::vector<char> path(1024);
@ -442,63 +427,57 @@ static std::wstring get_executable_path() {
path.resize(path.size() * 2); path.resize(path.size() * 2);
} }
return utf8_to_utf16(base_path + "/"); return base_path + "/";
#elif defined(_WIN32) #elif defined(_WIN32)
std::vector<wchar_t> path(MAX_PATH); std::vector<char> path(MAX_PATH);
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size()); DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
if (len == 0) { if (len == 0) {
return {}; return "";
} }
std::wstring base_path(path.data(), len); std::string base_path(path.data(), len);
// remove executable name // remove executable name
auto last_slash = base_path.find_last_of('\\'); auto last_slash = base_path.find_last_of('\\');
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
return base_path + L"\\"; return base_path + "\\";
#else
return {};
#endif #endif
} }
static std::wstring backend_filename_prefix() { static std::string backend_filename_prefix() {
#ifdef _WIN32 #ifdef _WIN32
return L"ggml-"; return "ggml-";
#else #else
return L"libggml-"; return "libggml-";
#endif #endif
} }
static std::wstring backend_filename_suffix() { static std::string backend_filename_suffix() {
#ifdef _WIN32 #ifdef _WIN32
return L".dll"; return ".dll";
#else #else
return L".so"; return ".so";
#endif
}
static std::wstring path_separator() {
#ifdef _WIN32
return L"\\";
#else
return L"/";
#endif #endif
} }
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths // TODO: search system paths
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-"; std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::wstring> search_paths; std::vector<std::string> search_paths;
if (user_search_path == nullptr) { if (user_search_path == nullptr) {
search_paths.push_back(L"." + path_separator()); search_paths.push_back("./");
search_paths.push_back(get_executable_path()); search_paths.push_back(get_executable_path());
} else { } else {
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator()); #if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
} }
int best_score = 0; int best_score = 0;
std::wstring best_path; std::string best_path;
namespace fs = std::filesystem; namespace fs = std::filesystem;
for (const auto & search_path : search_paths) { for (const auto & search_path : search_paths) {
@ -508,27 +487,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) { for (const auto & entry : dir_it) {
if (entry.is_regular_file()) { if (entry.is_regular_file()) {
std::wstring filename = entry.path().filename().wstring(); std::string filename = entry.path().filename().string();
std::wstring ext = entry.path().extension().wstring(); std::string ext = entry.path().extension().string();
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) }; dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
if (!handle && !silent) { if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
} }
if (handle) { if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn) { if (score_fn) {
int s = score_fn(); int s = score_fn();
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s); GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
#endif #endif
if (s > best_score) { if (s > best_score) {
best_score = s; best_score = s;
best_path = entry.path().wstring(); best_path = entry.path().string();
} }
} else { } else {
if (!silent) { if (!silent) {
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
} }
} }
} }
@ -540,15 +519,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (best_score == 0) { if (best_score == 0) {
// try to load the base backend // try to load the base backend
for (const auto & search_path : search_paths) { for (const auto & search_path : search_paths) {
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix(); std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
if (fs::exists(path)) { if (fs::exists(path)) {
return get_reg().load_backend(path, silent); return get_reg().load_backend(path.c_str(), silent);
} }
} }
return nullptr; return nullptr;
} }
return get_reg().load_backend(best_path, silent); return get_reg().load_backend(best_path.c_str(), silent);
} }
void ggml_backend_load_all() { void ggml_backend_load_all() {

View File

@ -135,20 +135,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif() endif()
# show enabled features # show enabled features
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
set(FEAT_INPUT_FILE "NUL")
else()
set(FEAT_INPUT_FILE "/dev/null")
endif()
execute_process( execute_process(
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E - COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
INPUT_FILE ${FEAT_INPUT_FILE} INPUT_FILE "/dev/null"
OUTPUT_VARIABLE ARM_FEATURE OUTPUT_VARIABLE ARM_FEATURE
RESULT_VARIABLE ARM_FEATURE_RESULT RESULT_VARIABLE ARM_FEATURE_RESULT
) )
if (ARM_FEATURE_RESULT) if (ARM_FEATURE_RESULT)
message(WARNING "Failed to get ARM features") message(FATAL_ERROR "Failed to get ARM features")
else() else()
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC) foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos) string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
@ -323,11 +317,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
if (GGML_BACKEND_DL) if (GGML_BACKEND_DL)
if (GGML_NATIVE)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
# The feature detection code is compiled as a separate target so that # The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags # it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same # Since multiple variants of the CPU backend may be included in the same

View File

@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
if (src1_cont) { if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++) for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(params, if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type), nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13, (const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type), nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3, (char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type), nb1/ggml_type_size(dst->type),
ith, nth,
src0->type, src0->type,
src1->type, src1->type,
dst->type)) dst->type))
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++) for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(params, if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type), nb01/ggml_type_size(src0->type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type), row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3, (char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type), nb1/ggml_type_size(dst->type),
ith, nth,
src0->type, src0->type,
vec_dot_type, vec_dot_type,
dst->type)) dst->type))

View File

@ -53,8 +53,6 @@
#include "ggml-cpu-impl.h" #include "ggml-cpu-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include <atomic>
#ifdef _MSC_VER #ifdef _MSC_VER
#define NOINLINE __declspec(noinline) #define NOINLINE __declspec(noinline)
#else #else
@ -136,16 +134,6 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
return _mm512_fmadd_ps(a, b, c); return _mm512_fmadd_ps(a, b, c);
} }
#endif #endif
#if defined(__AVX512BF16__)
template <>
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
return _mm512_dpbf16_ps(c, a, b);
}
template <>
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
return _mm256_dpbf16_ps(c, a, b);
}
#endif
#endif #endif
#if defined(__ARM_FEATURE_FMA) #if defined(__ARM_FEATURE_FMA)
@ -238,13 +226,6 @@ template <> inline __m256 load(const float *p) {
} }
#endif // __AVX__ #endif // __AVX__
#if defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m256 load(const ggml_bf16_t *p) {
return _mm256_castsi256_ps(
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
}
#endif // __AVX2__
#if defined(__F16C__) #if defined(__F16C__)
template <> inline __m256 load(const ggml_fp16_t *p) { template <> inline __m256 load(const ggml_fp16_t *p) {
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p)); return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
@ -258,27 +239,8 @@ template <> inline __m512 load(const float *p) {
template <> inline __m512 load(const ggml_fp16_t *p) { template <> inline __m512 load(const ggml_fp16_t *p) {
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p)); return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
} }
template <> inline __m512 load(const ggml_bf16_t *p) {
return _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
}
#endif // __AVX512F__ #endif // __AVX512F__
#if defined(__AVX512BF16__)
template <> inline __m512bh load(const ggml_bf16_t *p) {
return (__m512bh)_mm512_loadu_ps((const float *)p);
}
template <> inline __m256bh load(const ggml_bf16_t *p) {
return (__m256bh)_mm256_loadu_ps((const float *)p);
}
template <> inline __m512bh load(const float *p) {
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
template <> inline __m256bh load(const float *p) {
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// CONSTANTS // CONSTANTS
@ -290,170 +252,199 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION // FLOATING POINT MATRIX MULTIPLICATION
template <int M>
static inline int64_t BLOCK_SIZE(size_t m) {
const int64_t NB_BLOC_M = (m + M - 1) / M;
return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
}
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}
template <int KN, typename D, typename V, typename TA, typename TB, typename TC> template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS { class tinyBLAS {
public: public:
tinyBLAS(const ggml_compute_params * params, int64_t k, tinyBLAS(int64_t k,
const TA *A, int64_t lda, const TA *A, int64_t lda,
const TB *B, int64_t ldb, const TB *B, int64_t ldb,
TC *C, int64_t ldc) TC *C, int64_t ldc,
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) { int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
} }
bool matmul(int64_t m, int64_t n) { void matmul(int64_t m, int64_t n) {
if (k % KN != 0) mnpack(0, m, 0, n);
return false;
// compute RM for only need tile with size RM&RM-1
#if VECTOR_REGISTERS == 32
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#else // VECTOR_REGISTERS == 16
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#endif
return false;
} }
private: private:
template <int RM, int RN, int BM> NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) { int64_t mc, nc, mp, np;
if (SIZE_N == RN) { switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
return gemm<RM, RN, BM>(m, n, BN); #if VECTOR_REGISTERS == 32
} case 0x55:
if constexpr (RN > 1) { mc = 5;
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN); nc = 5;
} else { gemm<5, 5>(m0, m, n0, n);
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N); break;
GGML_ASSERT(false); // we have miss something. case 0x45:
mc = 4;
nc = 5;
gemm<4, 5>(m0, m, n0, n);
break;
case 0x54:
mc = 5;
nc = 4;
gemm<5, 4>(m0, m, n0, n);
break;
case 0x44:
mc = 4;
nc = 4;
gemm<4, 4>(m0, m, n0, n);
break;
case 0x53:
mc = 5;
nc = 3;
gemm<5, 3>(m0, m, n0, n);
break;
case 0x35:
mc = 3;
nc = 5;
gemm<3, 5>(m0, m, n0, n);
break;
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
#else
case 0x55:
case 0x54:
case 0x53:
case 0x45:
case 0x44:
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
case 0x35:
#endif
case 0x34:
mc = 3;
nc = 4;
gemm<3, 4>(m0, m, n0, n);
break;
case 0x52:
mc = 5;
nc = 2;
gemm<5, 2>(m0, m, n0, n);
break;
case 0x33:
mc = 3;
nc = 3;
gemm<3, 3>(m0, m, n0, n);
break;
case 0x25:
mc = 2;
nc = 5;
gemm<2, 5>(m0, m, n0, n);
break;
case 0x42:
mc = 4;
nc = 2;
gemm<4, 2>(m0, m, n0, n);
break;
case 0x24:
mc = 2;
nc = 4;
gemm<2, 4>(m0, m, n0, n);
break;
case 0x32:
mc = 3;
nc = 2;
gemm<3, 2>(m0, m, n0, n);
break;
case 0x23:
mc = 2;
nc = 3;
gemm<2, 3>(m0, m, n0, n);
break;
case 0x51:
mc = 5;
nc = 1;
gemm<5, 1>(m0, m, n0, n);
break;
case 0x41:
mc = 4;
nc = 1;
gemm<4, 1>(m0, m, n0, n);
break;
case 0x22:
mc = 2;
nc = 2;
gemm<2, 2>(m0, m, n0, n);
break;
case 0x15:
mc = 1;
nc = 5;
gemm<1, 5>(m0, m, n0, n);
break;
case 0x14:
mc = 1;
nc = 4;
gemm<1, 4>(m0, m, n0, n);
break;
case 0x31:
mc = 3;
nc = 1;
gemm<3, 1>(m0, m, n0, n);
break;
case 0x13:
mc = 1;
nc = 3;
gemm<1, 3>(m0, m, n0, n);
break;
case 0x21:
mc = 2;
nc = 1;
gemm<2, 1>(m0, m, n0, n);
break;
case 0x12:
mc = 1;
nc = 2;
gemm<1, 2>(m0, m, n0, n);
break;
case 0x11:
mc = 1;
nc = 1;
gemm<1, 1>(m0, m, n0, n);
break;
default:
return;
} }
mp = m0 + (m - m0) / mc * mc;
np = n0 + (n - n0) / nc * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
} }
template <int RM, int RN> template <int RM, int RN>
inline void gemm_bloc(int64_t ii, int64_t jj) { NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
D Cv[RN][RM] = {}; D Cv[RN][RM] = {};
for (int64_t l = 0; l < k; l += KN) { for (int64_t l = 0; l < k; l += KN)
// help compiler for op order. for (int64_t j = 0; j < RN; ++j)
if constexpr (RM <= RN) { for (int64_t i = 0; i < RM; ++i)
V Av[RM]; Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
for (int64_t i = 0; i < RM; ++i) { load<V>(B + ldb * (jj + j) + l),
Av[i] = load<V>(A + lda * (ii + i) + l); Cv[j][i]);
}
for (int64_t j = 0; j < RN; ++j) {
V Bv = load<V>(B + ldb * (jj + j) + l);
for (int64_t i = 0; i < RM; ++i) {
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
}
}
} else {
V Bv[RN];
for (int64_t j = 0; j < RN; ++j) {
Bv[j] = load<V>(B + ldb * (jj + j) + l);
}
for (int64_t i = 0; i < RM; ++i) {
V Av = load<V>(A + lda * (ii + i) + l);
for (int64_t j = 0; j < RN; ++j) {
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
}
}
}
}
for (int64_t j = 0; j < RN; ++j) for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i) for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
} }
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
static std::atomic<int64_t> current_chunk;
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
} }
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
// next step.
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A; const TA *const A;
const TB *const B; const TB *const B;
TC *const C; TC *const C;
@ -461,6 +452,8 @@ class tinyBLAS {
const int64_t lda; const int64_t lda;
const int64_t ldb; const int64_t ldb;
const int64_t ldc; const int64_t ldc;
const int ith;
const int nth;
}; };
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
@ -1664,9 +1657,8 @@ class tinyBLAS_PPC {
* @param Ctype is GGML data type of `C` * @param Ctype is GGML data type of `C`
* @return true if this function was able to service the matmul request * @return true if this function was able to service the matmul request
*/ */
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k, bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
const void *A, int64_t lda, const void *B, int64_t ldb, void *C, int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
int64_t ldc, int Atype, int Btype, int Ctype) {
assert(m >= 0); assert(m >= 0);
assert(n >= 0); assert(n >= 0);
@ -1674,8 +1666,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
assert(lda >= k); assert(lda >= k);
assert(ldb >= k); assert(ldb >= k);
assert(ldc >= m); assert(ldc >= m);
assert(params->nth > 0); assert(nth > 0);
assert(params->ith < params->nth); assert(ith < nth);
// only enable sgemm for prompt processing // only enable sgemm for prompt processing
if (n < 2) if (n < 2)
@ -1690,25 +1682,37 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
if (Btype != GGML_TYPE_F32) if (Btype != GGML_TYPE_F32)
return false; return false;
#if defined(__AVX512F__) #if defined(__AVX512F__)
tinyBLAS<16, __m512, __m512, float, float, float> tb{ params, if (k % 16)
return false;
tinyBLAS<16, __m512, __m512, float, float, float> tb{
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc}; (float *)C, ldc,
return tb.matmul(m, n); ith, nth};
tb.matmul(m, n);
return true;
#elif defined(__AVX__) || defined(__AVX2__) #elif defined(__AVX__) || defined(__AVX2__)
tinyBLAS<8, __m256, __m256, float, float, float> tb{ params, if (k % 8)
return false;
tinyBLAS<8, __m256, __m256, float, float, float> tb{
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc}; (float *)C, ldc,
return tb.matmul(m, n); ith, nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_NEON) #elif defined(__ARM_NEON)
if (n < 4) if (n < 4)
return false; return false;
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params, if (k % 4)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc}; (float *)C, ldc,
return tb.matmul(m, n); ith, nth};
tb.matmul(m, n);
return true;
#elif defined(__MMA__) #elif defined(__MMA__)
if (k % 8) if (k % 8)
return false; return false;
@ -1716,7 +1720,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
params->ith, params->nth}; ith, nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1724,71 +1728,60 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
#endif #endif
} }
case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX512F__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX2__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_F16: { case GGML_TYPE_F16: {
#if defined(__AVX512F__) #if defined(__AVX512F__)
if (Btype == GGML_TYPE_F16) { if (k % 16)
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k, return false;
(const ggml_fp16_t *)A, lda, if (Btype != GGML_TYPE_F32)
(const ggml_fp16_t *)B, ldb, return false;
(float *)C, ldc}; tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
return tb.matmul(m, n); k, (const ggml_fp16_t *)A, lda,
} (const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
if (Btype == GGML_TYPE_F16) { if (k % 8)
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k, return false;
(const ggml_fp16_t *)A, lda, if (Btype != GGML_TYPE_F32)
(const ggml_fp16_t *)B, ldb, return false;
(float *)C, ldc}; tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
return tb.matmul(m, n); k, (const ggml_fp16_t *)A, lda,
} (const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
if (n < 8) if (n < 8)
return false; return false;
if (Btype == GGML_TYPE_F16) { if (k % 8)
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params, return false;
if (Btype != GGML_TYPE_F16)
return false;
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
k, (const ggml_fp16_t *)A, lda, k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb, (const ggml_fp16_t *)B, ldb,
(float *)C, ldc}; (float *)C, ldc,
return tb.matmul(m, n); ith, nth};
} tb.matmul(m, n);
return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER) #elif defined(__ARM_NEON) && !defined(_MSC_VER)
if (Btype == GGML_TYPE_F32) { if (k % 4)
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params, return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda, k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc}; (float *)C, ldc,
return tb.matmul(m, n); ith, nth};
} tb.matmul(m, n);
#endif return true;
#else
return false; return false;
#endif
} }
case GGML_TYPE_Q8_0: { case GGML_TYPE_Q8_0: {
@ -1799,7 +1792,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
k, (const block_q8_0 *)A, lda, k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
params->ith, params->nth}; ith, nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
@ -1807,7 +1800,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
k, (const block_q8_0 *)A, lda, k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
params->ith, params->nth}; ith, nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1823,7 +1816,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
k, (const block_q4_0 *)A, lda, k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
params->ith, params->nth}; ith, nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
@ -1831,7 +1824,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
k, (const block_q4_0 *)A, lda, k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
params->ith, params->nth}; ith, nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1847,7 +1840,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
k, (const block_q5_0 *)A, lda, k, (const block_q5_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
params->ith, params->nth}; ith, nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1863,7 +1856,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
k, (const block_iq4_nl *)A, lda, k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
params->ith, params->nth}; ith, nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1875,7 +1868,6 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
return false; return false;
} }
(void)params;
(void)m; (void)m;
(void)n; (void)n;
(void)k; (void)k;
@ -1885,6 +1877,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
(void)ldb; (void)ldb;
(void)C; (void)C;
(void)ldc; (void)ldc;
(void)ith;
(void)nth;
(void)Atype; (void)Atype;
(void)Btype; (void)Btype;
(void)Ctype; (void)Ctype;

View File

@ -5,8 +5,8 @@
extern "C" { extern "C" {
#endif #endif
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t, bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t, const void *, int64_t, void *, int64_t, int, int,
int, int, int); int, int, int);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -126,8 +126,6 @@ connection = sqlite3.connect(input_file)
cursor = connection.cursor() cursor = connection.cursor()
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall() builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
commit_short_len = len(builds[0][0])
try: try:
repo = git.Repo(".", search_parent_directories=True) repo = git.Repo(".", search_parent_directories=True)
except git.InvalidGitRepositoryError: except git.InvalidGitRepositoryError:
@ -140,11 +138,11 @@ def find_parent_in_data(commit: git.Commit):
seen_hexsha8 = set() seen_hexsha8 = set()
while heap: while heap:
depth, current_commit = heapq.heappop(heap) depth, current_commit = heapq.heappop(heap)
current_hexsha8 = commit.hexsha[:commit_short_len] current_hexsha8 = commit.hexsha[:8]
if (current_hexsha8,) in builds: if (current_hexsha8,) in builds:
return current_hexsha8 return current_hexsha8
for parent in commit.parents: for parent in commit.parents:
parent_hexsha8 = parent.hexsha[:commit_short_len] parent_hexsha8 = parent.hexsha[:8]
if parent_hexsha8 not in seen_hexsha8: if parent_hexsha8 not in seen_hexsha8:
seen_hexsha8.add(parent_hexsha8) seen_hexsha8.add(parent_hexsha8)
heapq.heappush(heap, (depth + 1, parent)) heapq.heappush(heap, (depth + 1, parent))
@ -158,9 +156,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
while unvisited: while unvisited:
current_commit = unvisited.pop(0) current_commit = unvisited.pop(0)
visited.append(current_commit.hexsha[:commit_short_len]) visited.append(current_commit.hexsha[:8])
for parent in current_commit.parents: for parent in current_commit.parents:
if parent.hexsha[:commit_short_len] not in visited: if parent.hexsha[:8] not in visited:
unvisited.append(parent) unvisited.append(parent)
return visited return visited
@ -171,10 +169,10 @@ def get_commit_name(hexsha8):
if repo is None: if repo is None:
return hexsha8 return hexsha8
for h in repo.heads: for h in repo.heads:
if h.commit.hexsha[:commit_short_len] == hexsha8: if h.commit.hexsha[:8] == hexsha8:
return h.name return h.name
for t in repo.tags: for t in repo.tags:
if t.commit.hexsha[:commit_short_len] == hexsha8: if t.commit.hexsha[:8] == hexsha8:
return t.name return t.name
return hexsha8 return hexsha8
@ -185,13 +183,13 @@ def get_commit_hexsha8(name):
return None return None
for h in repo.heads: for h in repo.heads:
if h.name == name: if h.name == name:
return h.commit.hexsha[:commit_short_len] return h.commit.hexsha[:8]
for t in repo.tags: for t in repo.tags:
if t.name == name: if t.name == name:
return t.commit.hexsha[:commit_short_len] return t.commit.hexsha[:8]
for c in repo.iter_commits("--all"): for c in repo.iter_commits("--all"):
if c.hexsha[:commit_short_len] == name[:commit_short_len]: if c.hexsha[:8] == name[:8]:
return c.hexsha[:commit_short_len] return c.hexsha[:8]
return None return None

View File

@ -26,7 +26,7 @@ function has_cmd {
} }
if has_cmd wget; then if has_cmd wget; then
cmd="wget -q -c -O %s/%s %s" cmd="wget -q --show-progress -c -O %s/%s %s"
elif has_cmd curl; then elif has_cmd curl; then
cmd="curl -C - -f --output-dir %s -o %s -L %s" cmd="curl -C - -f --output-dir %s -o %s -L %s"
else else

View File

@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
} }
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) { llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id; return vocab.special_bos_id;
} }
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {

View File

@ -45,7 +45,7 @@ struct llama_vocab {
id special_unk_id = 0; id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL; id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930 id special_cls_id = LLAMA_TOKEN_NULL;
id special_mask_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13; id linefeed_id = 13;