Compare commits

..

6 Commits

Author SHA1 Message Date
pancake
7a77786991
Merge a279f17815 into 815fe72adc 2024-11-01 09:33:20 +01:00
Georgi Gerganov
815fe72adc
sync : ggml
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
2024-11-01 10:28:24 +02:00
Georgi Gerganov
f221d56220
ggml : alloc ggml_contexts on the heap (whisper/2525) 2024-11-01 10:24:50 +02:00
Zhenwei Jin
e597e50794
build: fix build error in Windows env with OneAPI setup (#10107) 2024-11-01 11:09:59 +08:00
Diego Devesa
85679d37f3
llama : improve output buffer type selection (#10098)
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
2024-11-01 00:49:53 +01:00
Diego Devesa
1e9f94994e
quantize : fix --keep-split (#10114) 2024-11-01 00:45:34 +01:00
5 changed files with 59 additions and 88 deletions

View File

@ -217,7 +217,6 @@
#define GGML_MAX_DIMS 4 #define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 2048 #define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10 #define GGML_MAX_SRC 10
#define GGML_MAX_N_THREADS 512 #define GGML_MAX_N_THREADS 512
#define GGML_MAX_OP_PARAMS 64 #define GGML_MAX_OP_PARAMS 64
@ -657,6 +656,7 @@ extern "C" {
}; };
// scratch buffer // scratch buffer
// TODO: deprecate and remove
struct ggml_scratch { struct ggml_scratch {
size_t offs; size_t offs;
size_t size; size_t size;
@ -760,8 +760,9 @@ extern "C" {
// main // main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx); GGML_API void ggml_reset(struct ggml_context * ctx);
GGML_API void ggml_free (struct ggml_context * ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

View File

@ -1402,7 +1402,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
find_library(MATH_LIBRARY m) find_library(MATH_LIBRARY m)
if (MATH_LIBRARY) if (MATH_LIBRARY)
if (NOT WIN32 OR NOT GGML_SYCL) if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
list(APPEND GGML_EXTRA_LIBS_PRIVATE m) list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
endif() endif()
endif() endif()

View File

@ -306,6 +306,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
} }
#define GGML_DEBUG 0 #define GGML_DEBUG 0
#define GGML_GELU_FP16 #define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16 #define GGML_GELU_QUICK_FP16
@ -2014,7 +2015,7 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
struct ggml_context { struct ggml_context {
size_t mem_size; size_t mem_size;
void* mem_buffer; void * mem_buffer;
bool mem_buffer_owned; bool mem_buffer_owned;
bool no_alloc; bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
@ -3263,7 +3264,6 @@ struct ggml_numa_nodes {
// //
struct ggml_state { struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa; struct ggml_numa_nodes numa;
}; };
@ -3845,7 +3845,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
const uint64_t t_start = ggml_time_us(); UNUSED(t_start); const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
g_state = (struct ggml_state) { g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
/*.numa =*/ { /*.numa =*/ {
.n_nodes = 0, .n_nodes = 0,
.total_cpus = 0, .total_cpus = 0,
@ -3864,26 +3863,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
is_first_call = false; is_first_call = false;
} }
// find non-used context in g_state ggml_critical_section_end();
struct ggml_context * ctx = NULL;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
if (!g_state.contexts[i].used) {
g_state.contexts[i].used = true;
ctx = &g_state.contexts[i].context;
GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
break;
}
}
if (ctx == NULL) {
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
ggml_critical_section_end();
return NULL;
}
// allow to call ggml_init with 0 size // allow to call ggml_init with 0 size
if (params.mem_size == 0) { if (params.mem_size == 0) {
@ -3911,42 +3893,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: context initialized\n", __func__); GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
ggml_critical_section_end();
return ctx; return ctx;
} }
void ggml_reset(struct ggml_context * ctx) {
if (ctx == NULL) {
return;
}
ctx->n_objects = 0;
ctx->objects_begin = NULL;
ctx->objects_end = NULL;
ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, };
ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, };
}
void ggml_free(struct ggml_context * ctx) { void ggml_free(struct ggml_context * ctx) {
if (ctx == NULL) { if (ctx == NULL) {
return; return;
} }
// make this function thread safe if (ctx->mem_buffer_owned) {
ggml_critical_section_start(); ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
bool found = false;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) {
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
}
found = true;
break;
}
} }
if (!found) { GGML_FREE(ctx);
GGML_PRINT_DEBUG("%s: context not found\n", __func__);
}
ggml_critical_section_end();
} }
size_t ggml_used_mem(const struct ggml_context * ctx) { size_t ggml_used_mem(const struct ggml_context * ctx) {

View File

@ -1 +1 @@
162e232411ee98ceb0cccfa84886118d917d2123 bb78a40dc60e04c626bac2b65840b509988e990d

View File

@ -4860,19 +4860,12 @@ struct llama_model_loader {
*last = 0; *last = 0;
*addr = mapping->addr; *addr = mapping->addr;
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
try { const auto * weight = get_weight(ggml_get_name(tensor));
const auto * weight = get_weight(ggml_get_name(tensor)); if (!weight || weight->idx != idx) {
if (!weight) { continue;
continue;
}
if (weight->idx != idx) {
continue;
}
*first = std::min(*first, weight->offs);
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
} catch(...) {
// the tensor is not in the model
} }
*first = std::min(*first, weight->offs);
*last = std::max(*last, weight->offs + ggml_nbytes(tensor));
} }
} }
@ -5049,7 +5042,6 @@ struct llama_model_loader {
ggml_backend_tensor_set(cur, data, 0, n_size); ggml_backend_tensor_set(cur, data, 0, n_size);
} }
} else { } else {
GGML_ASSERT(weight->idx < files.size());
const auto & file = files.at(weight->idx); const auto & file = files.at(weight->idx);
if (ggml_backend_buffer_is_host(cur->buffer)) { if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET); file->seek(weight->offs, SEEK_SET);
@ -17170,18 +17162,10 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
auto * buft = ggml_backend_cpu_buffer_type(); auto * buft = ggml_backend_cpu_buffer_type();
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
ggml_tensor * output_tensor = lctx.model.output; auto * output_dev = lctx.model.dev_output.dev;
if (!output_tensor) { auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
// bert models don't have an output tensor, use the last layer if (output_dev_host_buft) {
output_tensor = lctx.model.layers.back().layer_out_norm; buft = output_dev_host_buft;
}
if (output_tensor) {
auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer);
auto * output_dev = ggml_backend_buft_get_device(output_buft);
auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev);
if (output_dev_host_buft) {
buft = output_dev_host_buft;
}
} }
lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size); lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
if (lctx.buf_output == nullptr) { if (lctx.buf_output == nullptr) {
@ -18623,8 +18607,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} }
} }
// make a list of weights
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
tensors.reserve(ml.weights_map.size());
for (const auto & it : ml.weights_map) { for (const auto & it : ml.weights_map) {
const struct ggml_tensor * tensor = it.second.tensor; tensors.push_back(&it.second);
}
// keep_split requires that the weights are sorted by split index
if (params->keep_split) {
std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
if (a->idx == b->idx) {
return a->offs < b->offs;
}
return a->idx < b->idx;
});
}
for (const auto * it : tensors) {
const struct ggml_tensor * tensor = it->tensor;
const std::string name = ggml_get_name(tensor); const std::string name = ggml_get_name(tensor);
@ -18664,22 +18665,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<no_init<float>> f32_conv_buf; std::vector<no_init<float>> f32_conv_buf;
uint16_t n_split = 1; uint16_t n_split = 1;
const auto & weights_map = ml.weights_map;
// Assume split index is continuous // Assume split index is continuous
if (params->keep_split) { if (params->keep_split) {
for (const auto & it : weights_map) { for (const auto * it : tensors) {
n_split = std::max(uint16_t(it.second.idx + 1), n_split); n_split = std::max(uint16_t(it->idx + 1), n_split);
} }
} }
std::vector<gguf_context*> ctx_outs(n_split, NULL); std::vector<gguf_context*> ctx_outs(n_split, NULL);
ctx_outs[0] = ctx_out; ctx_outs[0] = ctx_out;
// populate the original tensors so we get an initial meta data // populate the original tensors so we get an initial meta data
for (const auto & it : weights_map) { for (const auto * it : tensors) {
uint16_t i_split = params->keep_split ? it.second.idx : 0; uint16_t i_split = params->keep_split ? it->idx : 0;
struct ggml_tensor * tensor = it.second.tensor; struct ggml_tensor * tensor = it->tensor;
if (ctx_outs[i_split] == NULL) { if (ctx_outs[i_split] == NULL) {
ctx_outs[i_split] = gguf_init_empty(); ctx_outs[i_split] = gguf_init_empty();
} }
@ -18726,8 +18725,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const auto tn = LLM_TN(model.arch); const auto tn = LLM_TN(model.arch);
new_ofstream(0); new_ofstream(0);
for (const auto & it : weights_map) { for (const auto * it : tensors) {
const auto & weight = it.second; const auto & weight = *it;
struct ggml_tensor * tensor = weight.tensor; struct ggml_tensor * tensor = weight.tensor;
if (weight.idx != cur_split && params->keep_split) { if (weight.idx != cur_split && params->keep_split) {
close_ofstream(); close_ofstream();