mirror of https://github.com/ggerganov/llama.cpp.git
synced 2024-11-14 23:09:53 +00:00

Compare commits: 8d2b6381b8 ... 7a77786991 (7 commits)

| SHA1 |
|---|
| 7a77786991 |
| 815fe72adc |
| f221d56220 |
| e597e50794 |
| 85679d37f3 |
| 1e9f94994e |
| a279f17815 |
@@ -357,6 +357,10 @@ class Model:
                 data_qtype = gguf.GGMLQuantizationType.TQ1_0
             elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
                 data_qtype = gguf.GGMLQuantizationType.TQ2_0
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                data_qtype = gguf.GGMLQuantizationType.Q4_0
+            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                data_qtype = gguf.GGMLQuantizationType.Q4_1
             else:
                 raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -4296,8 +4300,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q4_0", "q4_1", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q4_0, q4_1 , q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",

@@ -4383,6 +4387,8 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,

@@ -217,7 +217,6 @@

 #define GGML_MAX_DIMS 4
 #define GGML_MAX_PARAMS 2048
-#define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 10
 #define GGML_MAX_N_THREADS 512
 #define GGML_MAX_OP_PARAMS 64

@@ -657,6 +656,7 @@ extern "C" {
     };

     // scratch buffer
+    // TODO: deprecate and remove
     struct ggml_scratch {
         size_t offs;
         size_t size;

@@ -761,6 +761,7 @@ extern "C" {
     // main

     GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
+    GGML_API void ggml_reset(struct ggml_context * ctx);
     GGML_API void ggml_free (struct ggml_context * ctx);

     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

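The new `ggml_reset()` complements `ggml_init()`/`ggml_free()`: it discards the objects in a context while keeping its memory buffer. Below is a minimal usage sketch, not part of the diff; the buffer size and tensor shapes are invented for illustration.

```c
#include <stdio.h>

#include "ggml.h"

int main(void) {
    // hypothetical working-buffer size, chosen only for this example
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,   // let ggml allocate the buffer
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return 1;
    }

    for (int step = 0; step < 4; step++) {
        // build a few temporary tensors for this step
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        struct ggml_tensor * c = ggml_add(ctx, a, b);
        (void) c;

        printf("step %d: %zu bytes used\n", step, ggml_used_mem(ctx));

        // drop all objects but keep the underlying buffer for the next step
        ggml_reset(ctx);
    }

    ggml_free(ctx);
    return 0;
}
```

Before this change, reclaiming a context's objects required a full `ggml_free()` followed by another `ggml_init()`.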
@@ -1402,7 +1402,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)

 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
-    if (NOT WIN32 OR NOT GGML_SYCL)
+    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
         list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
     endif()
 endif()

@@ -306,6 +306,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 }

 #define GGML_DEBUG 0

 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16

@@ -3263,7 +3264,6 @@ struct ggml_numa_nodes {
 //

 struct ggml_state {
-    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
     struct ggml_numa_nodes numa;
 };

@@ -3845,7 +3845,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

         g_state = (struct ggml_state) {
-            /*.contexts =*/ { { 0 } },
             /*.numa =*/ {
                 .n_nodes = 0,
                 .total_cpus = 0,

@@ -3864,26 +3863,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         is_first_call = false;
     }

-    // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (!g_state.contexts[i].used) {
-            g_state.contexts[i].used = true;
-            ctx = &g_state.contexts[i].context;
-
-            GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
-            break;
-        }
-    }
-
-    if (ctx == NULL) {
-        GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
-
-        ggml_critical_section_end();
-
-        return NULL;
-    }
+    ggml_critical_section_end();
+
+    struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));

     // allow to call ggml_init with 0 size
     if (params.mem_size == 0) {

@@ -3911,42 +3893,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

-    ggml_critical_section_end();
-
     return ctx;
 }

+void ggml_reset(struct ggml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    ctx->n_objects = 0;
+    ctx->objects_begin = NULL;
+    ctx->objects_end = NULL;
+    ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, };
+    ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, };
+}
+
 void ggml_free(struct ggml_context * ctx) {
     if (ctx == NULL) {
         return;
     }

-    // make this function thread safe
-    ggml_critical_section_start();
-
-    bool found = false;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (&g_state.contexts[i].context == ctx) {
-            g_state.contexts[i].used = false;
-
-            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
-                __func__, i, ggml_used_mem(ctx));
-
-            if (ctx->mem_buffer_owned) {
-                ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
-            }
-
-            found = true;
-            break;
-        }
-    }
-
-    if (!found) {
-        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
-    }
-
-    ggml_critical_section_end();
+    if (ctx->mem_buffer_owned) {
+        ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
+    }
+
+    GGML_FREE(ctx);
 }

 size_t ggml_used_mem(const struct ggml_context * ctx) {

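With contexts now allocated from the heap via `GGML_MALLOC` rather than drawn from the fixed `g_state.contexts` pool, the old cap of `GGML_MAX_CONTEXTS` (64) simultaneously live contexts goes away. A hedged sketch of what this enables follows; the context count and sizes are arbitrary and only meant to exceed the previous limit.

```c
#include <stdio.h>

#include "ggml.h"

int main(void) {
    enum { N_CTX = 128 };  // more than the old 64-entry context pool allowed
    struct ggml_context * ctxs[N_CTX];

    for (int i = 0; i < N_CTX; i++) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 4096,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,  // metadata only for this sketch
        };
        ctxs[i] = ggml_init(params);
        if (!ctxs[i]) {
            fprintf(stderr, "ggml_init failed at context %d\n", i);
            return 1;
        }
    }

    // each context owns its own heap allocation and is released individually
    for (int i = 0; i < N_CTX; i++) {
        ggml_free(ctxs[i]);
    }

    printf("created and freed %d contexts\n", N_CTX);
    return 0;
}
```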
@@ -1 +1 @@
-162e232411ee98ceb0cccfa84886118d917d2123
+bb78a40dc60e04c626bac2b65840b509988e990d

@@ -4860,19 +4860,12 @@ struct llama_model_loader {
         *last = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            try {
-                const auto * weight = get_weight(ggml_get_name(tensor));
-                if (!weight) {
-                    continue;
-                }
-                if (weight->idx != idx) {
-                    continue;
-                }
-                *first = std::min(*first, weight->offs);
-                *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
-            } catch(...) {
-                // the tensor is not in the model
-            }
+            const auto * weight = get_weight(ggml_get_name(tensor));
+            if (!weight || weight->idx != idx) {
+                continue;
+            }
+            *first = std::min(*first, weight->offs);
+            *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
         }
     }

@@ -5049,7 +5042,6 @@ struct llama_model_loader {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
-                GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);

@@ -17170,19 +17162,11 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {

         auto * buft = ggml_backend_cpu_buffer_type();
         // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-        ggml_tensor * output_tensor = lctx.model.output;
-        if (!output_tensor) {
-            // bert models don't have an output tensor, use the last layer
-            output_tensor = lctx.model.layers.back().layer_out_norm;
-        }
-        if (output_tensor) {
-            auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer);
-            auto * output_dev = ggml_backend_buft_get_device(output_buft);
-            auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev);
-            if (output_dev_host_buft) {
-                buft = output_dev_host_buft;
-            }
-        }
+        auto * output_dev = lctx.model.dev_output.dev;
+        auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
+        if (output_dev_host_buft) {
+            buft = output_dev_host_buft;
+        }
         lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
         if (lctx.buf_output == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));

@@ -18623,8 +18607,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
     }

+    // make a list of weights
+    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
+    tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
-        const struct ggml_tensor * tensor = it.second.tensor;
+        tensors.push_back(&it.second);
+    }
+
+    // keep_split requires that the weights are sorted by split index
+    if (params->keep_split) {
+        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
+            if (a->idx == b->idx) {
+                return a->offs < b->offs;
+            }
+            return a->idx < b->idx;
+        });
+    }
+
+    for (const auto * it : tensors) {
+        const struct ggml_tensor * tensor = it->tensor;

         const std::string name = ggml_get_name(tensor);

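The `keep_split` path above sorts the collected weights by split index first and by offset within the split second. The standalone sketch below only illustrates that ordering rule; it is not the llama.cpp code (which uses `std::sort` over `llama_tensor_weight` pointers), and the struct and sample values are invented.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct weight_ref {
    uint16_t idx;   // split file index
    size_t   offs;  // offset of the tensor data within that split
};

// primary key: split index; secondary key: offset within the split
static int cmp_weight_ref(const void * pa, const void * pb) {
    const struct weight_ref * a = pa;
    const struct weight_ref * b = pb;
    if (a->idx != b->idx) {
        return a->idx < b->idx ? -1 : 1;
    }
    if (a->offs != b->offs) {
        return a->offs < b->offs ? -1 : 1;
    }
    return 0;
}

int main(void) {
    struct weight_ref refs[] = {
        { 1, 4096 }, { 0, 8192 }, { 1, 0 }, { 0, 0 },
    };
    const size_t n = sizeof(refs)/sizeof(refs[0]);

    qsort(refs, n, sizeof(refs[0]), cmp_weight_ref);

    for (size_t i = 0; i < n; i++) {
        printf("idx=%u offs=%zu\n", (unsigned) refs[i].idx, refs[i].offs);
    }
    return 0;
}
```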
@@ -18664,22 +18665,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<no_init<float>> f32_conv_buf;

     uint16_t n_split = 1;
-    const auto & weights_map = ml.weights_map;

     // Assume split index is continuous
     if (params->keep_split) {
-        for (const auto & it : weights_map) {
-            n_split = std::max(uint16_t(it.second.idx + 1), n_split);
+        for (const auto * it : tensors) {
+            n_split = std::max(uint16_t(it->idx + 1), n_split);
         }
-
     }
     std::vector<gguf_context*> ctx_outs(n_split, NULL);
     ctx_outs[0] = ctx_out;

     // populate the original tensors so we get an initial meta data
-    for (const auto & it : weights_map) {
-        uint16_t i_split = params->keep_split ? it.second.idx : 0;
-        struct ggml_tensor * tensor = it.second.tensor;
+    for (const auto * it : tensors) {
+        uint16_t i_split = params->keep_split ? it->idx : 0;
+        struct ggml_tensor * tensor = it->tensor;
         if (ctx_outs[i_split] == NULL) {
             ctx_outs[i_split] = gguf_init_empty();
         }

@@ -18726,8 +18725,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
-    for (const auto & it : weights_map) {
-        const auto & weight = it.second;
+    for (const auto * it : tensors) {
+        const auto & weight = *it;
         struct ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();