shadow : cont gcc

ggml-ci
Georgi Gerganov 2025-01-12 16:09:49 +02:00
parent f65e3d324d
commit 10eb87409e
12 changed files with 509 additions and 511 deletions

File diff suppressed because it is too large.


@@ -579,8 +579,8 @@ private:
             seq.back().second = false;
         } else {
             std::string literal;
-            auto is_non_literal = [&](char c) {
-                return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
+            auto is_non_literal = [&](char ch) {
+                return NON_LITERAL_SET.find(ch) != NON_LITERAL_SET.end();
             };
             while (i < length) {
                 if (sub_pattern[i] == '\\' && i < length - 1) {
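
This rename of a lambda parameter (c to ch) is the pattern repeated throughout the commit: gcc's -Wshadow warns when a lambda parameter reuses the name of a variable in the enclosing scope, even if that outer variable is never captured. A minimal, hypothetical reproduction, not taken from the llama.cpp sources and assuming a build with g++ -Wshadow:

    #include <string>

    bool next_is_special(const std::string & sub_pattern, size_t i) {
        char c = sub_pattern[i];                // enclosing-scope local named 'c'
        auto is_non_literal = [&](char c) {     // g++ -Wshadow: declaration of 'c' shadows a previous local
            return c == '\\' || c == '"';
        };
        return is_non_literal(c);
    }

Renaming the parameter (here to ch, as the diff does) changes nothing about the behaviour and silences the warning.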


@@ -255,8 +255,8 @@ public:
         thrd = std::thread([this]() {
             while (true) {
                 {
-                    std::unique_lock<std::mutex> lock(mtx);
-                    cv.wait(lock, [this]() { return head != tail; });
+                    std::unique_lock<std::mutex> lock_thrd(mtx);
+                    cv.wait(lock_thrd, [this]() { return head != tail; });
                     cur = entries[head];


@@ -62,7 +62,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
     // decode in batches of ctx_params.n_batch tokens
-    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
+    auto decode_helper = [&ctx, &batch](int32_t n_batch) {
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
@@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
             common_batch_add(batch, 0, i, { 0 }, false);
         }
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+        if (!decode_helper(ctx_params.n_batch)) {
             LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
@@ -134,7 +134,7 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+        if (!decode_helper(ctx_params.n_batch)) {
             LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
@@ -156,7 +156,7 @@ int main(int argc, char ** argv) {
             common_batch_add(batch, 0, pp + i, { j }, true);
         }
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+        if (!decode_helper(ctx_params.n_batch)) {
             LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
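
These batched-bench hunks change decode_helper from a lambda that received ctx and batch as parameters, shadowing the locals of the same names in main(), to one that captures both by reference; the parameters disappear and every call site reduces to decode_helper(ctx_params.n_batch). A standalone sketch of the same refactor, using made-up types and names rather than the llama.cpp API:

    #include <cstdint>
    #include <vector>

    struct context     { int id; };
    struct token_batch { std::vector<int32_t> tokens; };

    int run() {
        context ctx{0};
        token_batch batch;

        // before: the parameters 'ctx' and 'batch' shadowed the locals above (g++ -Wshadow)
        // auto decode_helper = [](context & ctx, token_batch & batch, int32_t n_batch) { ... };

        // after: capture the locals by reference instead of passing them at every call site
        auto decode_helper = [&ctx, &batch](int32_t n_batch) {
            for (int32_t i = 0; i < (int32_t) batch.tokens.size(); i += n_batch) {
                // decode batch.tokens[i .. i + n_batch) against ctx here
            }
            return ctx.id >= 0;
        };

        return decode_helper(32) ? 0 : 1;
    }

The capture list also documents which enclosing state the helper touches, which reads more clearly than threading the same two arguments through every call.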


@@ -2082,7 +2082,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     }
     else if (ctx->has_qwen2vl_merger) {
         clip_image_u8 * resized = clip_image_u8_init();
-        auto patch_size = clip_patch_size(ctx) * 2;
+        auto patch_size = clip_get_patch_size(ctx) * 2;
         int nx = ceil((float)img->nx / patch_size) * patch_size;
         int ny = ceil((float)img->ny / patch_size) * patch_size;
         bicubic_resize(*img, *resized, nx, ny);
@@ -2293,15 +2293,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w
     return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
-int32_t clip_image_size(const struct clip_ctx * ctx) {
+int32_t clip_get_image_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_size;
 }
-int32_t clip_patch_size(const struct clip_ctx * ctx) {
+int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.patch_size;
 }
-int32_t clip_hidden_size(const struct clip_ctx * ctx) {
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.hidden_size;
 }


@@ -47,9 +47,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
-CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 // TODO: should be enum, not string
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
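
In clip.cpp and clip.h the accessors clip_image_size, clip_patch_size and clip_hidden_size become clip_get_image_size, clip_get_patch_size and clip_get_hidden_size, and the callers below are updated to match. One plausible motivation, stated here as an assumption rather than something visible in this diff, is that the header also declares a struct named clip_image_size; in C++ a function declaration hides a class of the same name, which makes the unprefixed name awkward to use:

    #include <cstdint>

    struct clip_ctx;                                    // opaque handle (assumed)
    struct clip_image_size { int width; int height; };  // type sharing the accessor's name (assumed)

    int32_t clip_image_size(const struct clip_ctx *) { return 336; }   // stub; hides the class name

    int32_t widest_side(const clip_ctx * ctx) {
        // clip_image_size sz;              // error: 'clip_image_size' now refers to the function
        struct clip_image_size sz;          // the elaborated form is required once the name is hidden
        sz.width  = clip_image_size(ctx);
        sz.height = clip_image_size(ctx);
        return sz.width > sz.height ? sz.width : sz.height;
    }

A clip_get_* prefix removes the collision, so both the type and the getter can be spelled naturally.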


@@ -105,8 +105,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
         struct ggml_context * ctx;
     } model;
-    const int32_t image_size = clip_image_size(ctx_clip);
-    const int32_t patch_size = clip_patch_size(ctx_clip);
+    const int32_t image_size = clip_get_image_size(ctx_clip);
+    const int32_t patch_size = clip_get_patch_size(ctx_clip);
     int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
@@ -353,7 +353,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         img_res_v.size = 0;
         img_res_v.data = nullptr;
-        const int32_t image_size = clip_image_size(ctx_clip);
+        const int32_t image_size = clip_get_image_size(ctx_clip);
         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);


@@ -3702,8 +3702,8 @@ int main(int argc, char ** argv) {
         ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool {
             json res_json = result->to_json();
             if (res_json.is_array()) {
-                for (const auto & res : res_json) {
-                    if (!server_sent_event(sink, "data", res)) {
+                for (const auto & item : res_json) {
+                    if (!server_sent_event(sink, "data", item)) {
                         return false;
                     }
                 }
@@ -3973,9 +3973,9 @@ int main(int argc, char ** argv) {
         std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
         ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
-            for (auto & res : results) {
-                GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
-                responses.push_back(res->to_json());
+            for (auto & result : results) {
+                GGML_ASSERT(dynamic_cast<server_task_result_embd*>(result.get()) != nullptr);
+                responses.push_back(result->to_json());
             }
         }, [&](const json & error_data) {
             res_error(res, error_data);
@@ -4063,9 +4063,9 @@ int main(int argc, char ** argv) {
         std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
         ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
-            for (auto & res : results) {
-                GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
-                responses.push_back(res->to_json());
+            for (auto & result : results) {
+                GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(result.get()) != nullptr);
+                responses.push_back(result->to_json());
             }
         }, [&](const json & error_data) {
             res_error(res, error_data);
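
In these server.cpp handlers the loop variable res is renamed to item or result. The outer res (the HTTP response object) is still needed right after the loop, as the unchanged res_error(res, error_data) lines show, so reusing the name inside the range-for both trips -Wshadow and invites mixing the two up. A small generic illustration, assuming an outer variable named res as in such a handler:

    #include <string>
    #include <vector>

    struct response { std::string body; };

    void handle(response & res, const std::vector<std::string> & results) {
        // before: 'res' shadowed the handler's response object
        // for (auto & res : results) { ... }       // which 'res' is meant inside the loop?

        // after: a distinct name keeps the outer 'res' unambiguous
        for (const auto & result : results) {
            res.body += result;                     // clearly the handler's response
        }
    }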


@@ -110,9 +110,8 @@ int main(int argc, char ** argv) {
     llama_token new_token_id;
     while (true) {
         // check if we have enough space in the context to evaluate this batch
-        int n_ctx = llama_n_ctx(ctx);
         int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
-        if (n_ctx_used + batch.n_tokens > n_ctx) {
+        if (n_ctx_used + batch.n_tokens > (int) llama_n_ctx(ctx)) {
             printf("\033[0m\n");
             fprintf(stderr, "context size exceeded\n");
             exit(0);
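
Here the local int n_ctx is removed and the call is inlined into the comparison, presumably because the local shadowed an n_ctx declared earlier in main() (an assumption; the outer variable is not visible in this hunk). The (int) cast matters: llama_n_ctx() returns uint32_t, and comparing the signed sum against it directly would swap the shadow warning for a sign-compare one. A reduced sketch with a stand-in getter:

    #include <cstdint>

    uint32_t context_size();   // stand-in for llama_n_ctx(ctx); declared but not defined here

    bool batch_fits(int n_ctx_used, int n_tokens) {
        // return n_ctx_used + n_tokens <= context_size();         // g++ -Wsign-compare: int vs uint32_t
        return n_ctx_used + n_tokens <= (int) context_size();      // explicit cast keeps the comparison signed
    }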


@@ -311,9 +311,9 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_m
         ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
     if (ggml_backend_split_buffer_type_fn) {
         size_t dev_index = [&]() {
-            auto * reg = ggml_backend_dev_backend_reg(dev);
-            for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
-                if (ggml_backend_reg_dev_get(reg, i) == dev) {
+            ggml_backend_reg_t reg_dev = ggml_backend_dev_backend_reg(dev);
+            for (size_t i = 0; i < ggml_backend_reg_dev_count(reg_dev); ++i) {
+                if (ggml_backend_reg_dev_get(reg_dev, i) == dev) {
                     return i;
                 }
             }
@@ -1304,7 +1304,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            return {cpu_dev, &pimpl->cpu_buft_list};
+            return { cpu_dev, &pimpl->cpu_buft_list };
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
@@ -1453,7 +1453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         // avoid using a host buffer when using mmap
         auto * buft_dev = ggml_backend_buft_get_device(buft);
         if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
-            auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
             buft = ggml_backend_dev_buffer_type(cpu_dev);
         }
@@ -3697,8 +3696,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
 const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
     auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
-            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
-                return it.first == name;
+            [name](const std::pair<std::string, struct ggml_tensor *> & entry) {
+                return entry.first == name;
             });
     if (it == tensors_by_name.end()) {
         return nullptr;
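
The llama-model.cpp hunks show the two fixes used throughout the commit side by side: rename the inner declaration (reg becomes reg_dev inside the dev_index lambda, since the enclosing code already works with a reg), or drop the inner declaration entirely and reuse the variable already in scope (the second auto * cpu_dev is deleted, and the following ggml_backend_dev_buffer_type(cpu_dev) call binds to the outer one). A compact illustration of both options with hypothetical names, not the ggml API:

    struct device;
    struct registry;

    registry * registry_of(device *) { return nullptr; }   // stub

    size_t device_index(device * dev, registry * reg) {
        // option 1: rename the inner variable instead of shadowing the parameter 'reg'
        registry * reg_dev = registry_of(dev);   // was: auto * reg = registry_of(dev);  (g++ -Wshadow)
        return reg_dev == reg ? 0 : 1;

        // option 2, used for 'cpu_dev' in the same file: delete the duplicate inner
        // declaration and let later code refer to the variable that is already in scope.
    }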


@@ -130,17 +130,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
     };
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
-    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
+    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name_layer) {
         if (n_expert > 1) {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
             // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
-            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
-                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
+            if (sscanf(name_layer, "blk.%d.", &i_layer) != 1) {
+                throw std::runtime_error(format("Failed to determine layer for tensor %s", name_layer));
             }
             if (i_layer < 0 || i_layer >= n_layer) {
-                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
+                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name_layer, n_layer));
             }
         }
         return std::make_pair(i_layer, n_layer);


@@ -2496,15 +2496,15 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
     // copy piece chars to output text buffer
     // skip up to 'lstrip' leading spaces before copying
-    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
-        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
-            token++;
+    auto _try_copy = [=] (const char * text, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *text == ' '; ++i) {
+            text++;
             size--;
         }
         if (length < (int32_t)size) {
             return -(int32_t) size;
         }
-        memcpy(buf, token, size);
+        memcpy(buf, text, size);
         return (int32_t) size;
     };