Merge remote-tracking branch 'origin/master' into bins

Commit: fe93cc96cc
Author: Olivier Chafik
Date:   2024-06-08 12:04:52 +01:00

11 changed files with 219 additions and 95 deletions

View File

@@ -84,4 +84,4 @@ endif ()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

View File

@@ -1491,6 +1491,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chat_template = argv[i];
         return true;
     }
+    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.slot_prompt_similarity = std::stof(argv[i]);
+        return true;
+    }
     if (arg == "-pps") {
         params.is_pp_shared = true;
         return true;
@@ -1913,6 +1921,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "set custom jinja chat template (default: template taken from model's metadata)\n"
         "only commonly used templates are accepted:\n"
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+    options.push_back({ "server",    "-sps, --slot-prompt-similarity SIMILARITY",
+                        "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });

View File

@@ -203,6 +203,8 @@ struct gpt_params {
     std::string slot_save_path;
 
+    float slot_prompt_similarity = 0.5f;
+
     // batched-bench params
     bool is_pp_shared = false;

View File

@@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
     int n;
     if (str.back() == 'M') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024; // megabytes
+        n_bytes = (size_t)n * 1000 * 1000; // megabytes
     } else if (str.back() == 'G') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
+        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
     } else {
         throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
     }
@@ -284,7 +284,7 @@ struct split_strategy {
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
             total_size += ggml_nbytes(t);
         }
-        total_size = total_size / 1024 / 1024; // convert to megabytes
+        total_size = total_size / 1000 / 1000; // convert to megabytes
         printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
         i_split++;
     }
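Note the semantics change hiding in these two hunks: M and G now mean decimal megabytes/gigabytes (10^6 and 10^9 bytes) instead of binary MiB/GiB. A minimal sketch of the numeric difference (illustrative only, not part of the commit):

#include <cstddef>
#include <cstdio>

int main() {
    // "250M" under the old (binary) and the new (decimal) interpretation:
    const size_t n   = 250;
    const size_t mib = n * 1024 * 1024; // old: 262144000 bytes
    const size_t mb  = n * 1000 * 1000; // new: 250000000 bytes
    printf("250M old: %zu bytes\n", mib);
    printf("250M new: %zu bytes\n", mb);
    // The same size string now yields ~4.9% smaller shards for M and ~7.4% for G.
    return 0;
}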

View File

@@ -151,6 +151,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 for (int j = 0; j < (int)src1->ne[0]; ++j) {
                     e.values[e_start + j] += x[j]*x[j];
                     e.counts[e_start + j]++;
+                    if (!std::isfinite(e.values[e_start + j])) {
+                        fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                        exit(1);
+                    }
                 }
             }
         }
@@ -183,6 +187,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
                 e.values[j] += x[j]*x[j];
                 e.counts[j]++;
+                if (!std::isfinite(e.values[j])) {
+                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                    exit(1);
+                }
             }
         }
         if (e.ncall > m_last_call) {
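The point of these guards is to fail fast: once a single activation is NaN or Inf, the running sum of squares stays non-finite forever, so checking the accumulator right after each update pinpoints the first bad value. A self-contained sketch of the same pattern (names are illustrative, not the collector's real interface):

#include <cmath>
#include <cstdio>
#include <cstdlib>

static void accumulate(float * values, const float * x, int n, const char * wname) {
    for (int j = 0; j < n; ++j) {
        values[j] += x[j] * x[j];
        if (!std::isfinite(values[j])) {
            fprintf(stderr, "%f detected in %s\n", values[j], wname);
            exit(1);
        }
    }
}

int main() {
    float values[3] = {0.0f, 0.0f, 0.0f};
    const float ok[3]  = {1.0f, 2.0f, 3.0f};
    const float bad[3] = {1.0f, NAN, 3.0f};
    accumulate(values, ok,  3, "blk.0.attn_q.weight"); // fine
    accumulate(values, bad, 3, "blk.0.attn_q.weight"); // prints "nan detected in ..." and exits
    return 0;
}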

View File

@@ -279,7 +279,7 @@ node index.js
 
 `id_slot`: Assign the completion task to a specific slot. If it is -1 the task will be assigned to an idle slot. Default: `-1`
 
-`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false`
+`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
 
 `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

View File

@@ -647,6 +647,9 @@ struct server_context {
     server_metrics metrics;
 
+    // Necessary similarity of prompt for slot selection
+    float slot_prompt_similarity = 0.0f;
+
     ~server_context() {
         if (ctx) {
             llama_free(ctx);
@@ -795,24 +798,88 @@ struct server_context {
         return prompt_tokens;
     }
 
-    server_slot * get_slot(int id) {
-        int64_t t_last = ggml_time_us();
-
-        server_slot * last_used = nullptr;
-
-        for (server_slot & slot : slots) {
-            if (slot.id == id && slot.available()) {
-                return &slot;
-            }
-
-            // among all available slots, find the one that has been least recently used
-            if (slot.available() && slot.t_last_used < t_last) {
-                last_used = &slot;
-                t_last = slot.t_last_used;
-            }
-        }
-
-        return last_used;
+    server_slot * get_slot_by_id(int id) {
+        for (server_slot & slot : slots) {
+            if (slot.id == id) {
+                return &slot;
+            }
+        }
+
+        return nullptr;
+    }
+
+    server_slot * get_available_slot(const std::string & prompt) {
+        server_slot * ret = nullptr;
+
+        // find the slot that has at least n% prompt similarity
+        if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
+            int max_lcp_len = 0;
+            float similarity = 0;
+
+            for (server_slot & slot : slots) {
+                // skip the slot if it is not available
+                if (!slot.available()) {
+                    continue;
+                }
+
+                // skip the slot if it does not contain a prompt
+                if (!slot.prompt.is_string()) {
+                    continue;
+                }
+
+                // current slot's prompt
+                std::string slot_prompt = slot.prompt.get<std::string>();
+
+                // length of the current slot's prompt
+                int slot_prompt_len = slot_prompt.size();
+
+                // length of the Longest Common Prefix between the current slot's prompt and the input prompt
+                int lcp_len = common_part(slot_prompt, prompt);
+
+                // fraction of the common prefix length relative to the current slot's prompt length
+                similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+
+                // select the current slot if the criteria match
+                if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
+                    max_lcp_len = lcp_len;
+                    ret = &slot;
+                }
+            }
+
+            if (ret != nullptr) {
+                LOG_VERBOSE("selected slot by lcp similarity", {
+                    {"id_slot",     ret->id},
+                    {"max_lcp_len", max_lcp_len},
+                    {"similarity",  similarity},
+                });
+            }
+        }
+
+        // find the slot that has been least recently used
+        if (ret == nullptr) {
+            int64_t t_last = ggml_time_us();
+            for (server_slot & slot : slots) {
+                // skip the slot if it is not available
+                if (!slot.available()) {
+                    continue;
+                }
+
+                // select the current slot if the criteria match
+                if (slot.t_last_used < t_last) {
+                    t_last = slot.t_last_used;
+                    ret = &slot;
+                }
+            }
+
+            if (ret != nullptr) {
+                LOG_VERBOSE("selected slot by lru", {
+                    {"id_slot", ret->id},
+                    {"t_last",  t_last},
+                });
+            }
+        }
+
+        return ret;
     }
 
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
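To make the similarity criterion above concrete: a slot is scored by the fraction of its cached prompt that is a common prefix of the incoming prompt, and the best-scoring slot wins if it clears the threshold. A standalone sketch (the function name and example prompts are illustrative):

#include <algorithm>
#include <cstdio>
#include <string>

static float prompt_similarity(const std::string & slot_prompt, const std::string & prompt) {
    const size_t n = std::min(slot_prompt.size(), prompt.size());
    size_t lcp = 0;
    while (lcp < n && slot_prompt[lcp] == prompt[lcp]) {
        lcp++;
    }
    return slot_prompt.empty() ? 0.0f : (float) lcp / slot_prompt.size();
}

int main() {
    // "Hello " (6 chars) is the common prefix: 6 / 11 ~= 0.55, which clears
    // the default --slot-prompt-similarity threshold of 0.5.
    printf("%.2f\n", prompt_similarity("Hello world", "Hello there"));
    // No shared prefix: 0.00, so this slot falls through to LRU selection.
    printf("%.2f\n", prompt_similarity("Hello world", "Bonjour"));
    return 0;
}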
@@ -888,7 +955,7 @@ struct server_context {
         slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
 
         // get prompt
-        {
+        if (!task.infill) {
             const auto & prompt = data.find("prompt");
             if (prompt == data.end()) {
                 send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@@ -1515,13 +1582,29 @@ struct server_context {
         switch (task.type) {
             case SERVER_TASK_TYPE_COMPLETION:
                 {
-                    server_slot * slot = get_slot(json_value(task.data, "id_slot", -1));
+                    int id_slot = json_value(task.data, "id_slot", -1);
+                    std::string prompt = json_value(task.data, "prompt", std::string());
+
+                    server_slot * slot;
+
+                    if (id_slot != -1) {
+                        slot = get_slot_by_id(id_slot);
+                    } else {
+                        slot = get_available_slot(prompt);
+                    }
+
                     if (slot == nullptr) {
                         // if no slot is available, we defer this task for processing later
                         LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
                         queue_tasks.defer(task);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     if (task.data.contains("system_prompt")) {
                         std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
@@ -1638,11 +1721,17 @@ struct server_context {
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
                     int id_slot = task.data.at("id_slot");
-                    server_slot * slot = get_slot(id_slot);
+                    server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     const size_t token_count = slot->cache_tokens.size();
                     const int64_t t_start = ggml_time_us();
@@ -1673,11 +1762,17 @@ struct server_context {
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
                     int id_slot = task.data.at("id_slot");
-                    server_slot * slot = get_slot(id_slot);
+                    server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     const int64_t t_start = ggml_time_us();
@@ -1715,11 +1810,17 @@ struct server_context {
             case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
                     int id_slot = task.data.at("id_slot");
-                    server_slot * slot = get_slot(id_slot);
+                    server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;
                     }
+                    if (!slot->available()) {
+                        // if requested slot is unavailable, we defer this task for processing later
+                        LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+                        queue_tasks.defer(task);
+                        break;
+                    }
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
@@ -2467,6 +2568,9 @@ int main(int argc, char ** argv) {
         log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
     }
 
+    // Necessary similarity of prompt for slot selection
+    ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
+
     // load the model
     if (!ctx_server.load_model(params)) {
         state.store(SERVER_STATE_ERROR);

View File

@@ -253,6 +253,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<
     return i;
 }
 
+static size_t common_part(const std::string & a, const std::string & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
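A quick usage check for the new overload (copied here for illustration; in the server it feeds the similarity score in get_available_slot()):

#include <cassert>
#include <string>

static size_t common_part(const std::string & a, const std::string & b) {
    size_t i;
    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
    return i;
}

int main() {
    assert(common_part("translate: hello", "translate: goodbye") == 11); // "translate: "
    assert(common_part("abc", "abc") == 3);
    assert(common_part("abc", "xyz") == 0);
    return 0;
}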
 static bool ends_with(const std::string & str, const std::string & suffix) {
     return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }

View File

@@ -9108,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         // find the sum of exps in the block
         tmp = warp_reduce_sum(tmp, item_ct1);
         if (block_size > WARP_SIZE) {
+            item_ct1.barrier(sycl::access::fence_space::local_space);
             if (warp_id == 0) {
                 buf[lane_id] = 0.f;
             }
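The barrier matters because the two reduction phases (max, then sum) reuse the same local scratch buffer buf: without a fence, warp 0 can zero buf for the sum phase while stragglers are still reading the max-phase values. A host-side analogue using std::barrier, purely as an illustration of the race (C++20 threads, not the SYCL code itself):

#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    constexpr int n_threads = 4;
    int scratch[n_threads];                 // plays the role of the local buffer buf
    std::barrier<> sync(n_threads);

    auto worker = [&](int tid) {
        scratch[tid] = tid + 1;             // phase 1: publish a partial result
        sync.arrive_and_wait();
        int sum = 0;
        for (int v : scratch) sum += v;     // every thread reads all partials
        sync.arrive_and_wait();             // <- the fence the patch adds: without it, the
                                            //    write below could race with the reads above
        scratch[tid] = 0;                   // phase 2: reuse the same scratch buffer
        sync.arrive_and_wait();
        if (tid == 0) printf("sum = %d\n", sum); // sum = 10
    };

    std::vector<std::thread> threads;
    for (int t = 0; t < n_threads; ++t) threads.emplace_back(worker, t);
    for (auto & th : threads) th.join();
    return 0;
}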

View File

@@ -345,15 +345,12 @@ struct vk_context {
 };
 
 struct ggml_tensor_extra_gpu {
-    bool ready;
-
     size_t ctx_idx;
 
     vk_buffer_ref buffer_gpu;
     uint64_t offset;
 
     void reset() {
-        ready = false;
         ctx_idx = 0;
         buffer_gpu.reset();
         offset = 0;
@@ -2949,7 +2946,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -2958,12 +2955,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3114,7 +3111,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3122,12 +3119,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3246,14 +3243,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
@@ -3323,14 +3320,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
@@ -3459,7 +3456,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3467,17 +3464,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3636,7 +3633,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3641,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3769,9 +3766,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
@@ -4062,21 +4059,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
@@ -4336,7 +4333,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -5569,6 +5566,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5594,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5601,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5653,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5705,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5788,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5812,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
@@ -5943,7 +5931,9 @@ struct ggml_backend_vk_buffer_context {
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5990,18 +5980,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
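All of the "extra->offset + ...->view_offs" edits in this file follow from the init_tensor change above: a view tensor now shares its parent's extra (whose offset points at the parent allocation), so each use site must add the view's own byte offset instead of baking it into a per-view extra. A simplified sketch of the resulting invariant, with illustrative stand-in types (not the real structs):

#include <cstddef>
#include <cstdint>
#include <cstdio>

struct tensor_extra { uint64_t offset; };   // base offset of the owning allocation
struct tensor {
    tensor_extra * extra;                   // shared with view_src for views
    size_t         view_offs;               // byte offset of the view inside its source
};

// Effective buffer offset, as now computed at every use site in this file:
static uint64_t effective_offset(const tensor & t) {
    return t.extra->offset + t.view_offs;   // view_offs is 0 for non-view tensors
}

int main() {
    tensor_extra parent_extra = { 4096 };   // parent tensor lives at byte 4096
    tensor parent = { &parent_extra, 0 };
    tensor view   = { &parent_extra, 512 }; // view starts 512 bytes into the parent
    printf("parent @ %llu, view @ %llu\n",
           (unsigned long long) effective_offset(parent),
           (unsigned long long) effective_offset(view)); // parent @ 4096, view @ 4608
    return 0;
}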
@@ -6014,7 +6002,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6015,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6026,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
         return true;
     }
@@ -6264,7 +6252,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6272,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6293,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
         return true;
     }
@@ -6478,11 +6466,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-                return true;
-            } break;
+            return true;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6725,7 +6709,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6793,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
             } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
                 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
                 vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-                uint64_t offset = extra->offset;
+                uint64_t offset = extra->offset + src0->view_offs;
                 if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                     for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                         for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
             } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
                 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
                 vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-                uint64_t offset = extra->offset;
+                uint64_t offset = extra->offset + src1->view_offs;
                 if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                     for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                         for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6893,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
             } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
                 ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
                 vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-                uint64_t offset = extra->offset;
+                uint64_t offset = extra->offset + src2->view_offs;
                 if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                     for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                         for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7076,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
 
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;

View File

@@ -15237,6 +15237,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }