mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
llama : consistently catch and throw only exceptions deriving from std::exception (#1599)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
9d0693bce3
commit
c2df36d60d
55
llama.cpp
55
llama.cpp
@ -289,15 +289,15 @@ template <typename T>
|
|||||||
static T checked_mul(T a, T b) {
|
static T checked_mul(T a, T b) {
|
||||||
T ret = a * b;
|
T ret = a * b;
|
||||||
if (a != 0 && ret / a != b) {
|
if (a != 0 && ret / a != b) {
|
||||||
throw format("overflow multiplying %llu * %llu",
|
throw std::runtime_error(format("overflow multiplying %llu * %llu",
|
||||||
(unsigned long long) a, (unsigned long long) b);
|
(unsigned long long) a, (unsigned long long) b));
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t checked_div(size_t a, size_t b) {
|
static size_t checked_div(size_t a, size_t b) {
|
||||||
if (b == 0 || a % b != 0) {
|
if (b == 0 || a % b != 0) {
|
||||||
throw format("error dividing %zu / %zu", a, b);
|
throw std::runtime_error(format("error dividing %zu / %zu", a, b));
|
||||||
}
|
}
|
||||||
return a / b;
|
return a / b;
|
||||||
}
|
}
|
||||||
@ -361,7 +361,7 @@ struct llama_load_tensor {
|
|||||||
const auto & first_shard = shards.at(0);
|
const auto & first_shard = shards.at(0);
|
||||||
for (const auto & shard : shards) {
|
for (const auto & shard : shards) {
|
||||||
if (shard.type != first_shard.type) {
|
if (shard.type != first_shard.type) {
|
||||||
throw format("inconsistent tensor shard type in '%s'", name.c_str());
|
throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
type = first_shard.type;
|
type = first_shard.type;
|
||||||
@ -384,8 +384,8 @@ struct llama_load_tensor {
|
|||||||
const auto & first_shard = shards.at(0);
|
const auto & first_shard = shards.at(0);
|
||||||
for (const auto & shard : shards) {
|
for (const auto & shard : shards) {
|
||||||
if (shard.ne != first_shard.ne) {
|
if (shard.ne != first_shard.ne) {
|
||||||
throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
|
throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
|
||||||
name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
|
name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ne = first_shard.ne;
|
ne = first_shard.ne;
|
||||||
@ -463,8 +463,8 @@ struct llama_file_loader {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||||
magic, version);
|
magic, version));
|
||||||
}
|
}
|
||||||
void read_hparams() {
|
void read_hparams() {
|
||||||
hparams.n_vocab = file.read_u32();
|
hparams.n_vocab = file.read_u32();
|
||||||
@ -504,7 +504,7 @@ struct llama_file_loader {
|
|||||||
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
|
||||||
std::string name = file.read_string(name_len);
|
std::string name = file.read_string(name_len);
|
||||||
if (n_dims < 1 || n_dims > 2) {
|
if (n_dims < 1 || n_dims > 2) {
|
||||||
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
|
throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
|
||||||
}
|
}
|
||||||
switch (shard.type) {
|
switch (shard.type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
@ -521,7 +521,7 @@ struct llama_file_loader {
|
|||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
break;
|
break;
|
||||||
default: {
|
default: {
|
||||||
throw format("unrecognized tensor type %u\n", shard.type);
|
throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -630,7 +630,7 @@ struct llama_model_loader {
|
|||||||
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
||||||
file_loaders.emplace_back(ith_file);
|
file_loaders.emplace_back(ith_file);
|
||||||
if (ith_file->hparams != first_file->hparams) {
|
if (ith_file->hparams != first_file->hparams) {
|
||||||
throw format("llama.cpp: hparams inconsistent between files");
|
throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!llama_mmap::SUPPORTED) {
|
if (!llama_mmap::SUPPORTED) {
|
||||||
@ -660,7 +660,7 @@ struct llama_model_loader {
|
|||||||
uint32_t guess_n_parts() const {
|
uint32_t guess_n_parts() const {
|
||||||
auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
|
auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
|
||||||
if (it == tensors_map.name_to_idx.end()) {
|
if (it == tensors_map.name_to_idx.end()) {
|
||||||
throw std::string("missing tok_embeddings.weight");
|
throw std::runtime_error(std::string("missing tok_embeddings.weight"));
|
||||||
}
|
}
|
||||||
const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
||||||
return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
|
return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
|
||||||
@ -677,12 +677,12 @@ struct llama_model_loader {
|
|||||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
|
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
|
||||||
auto it = tensors_map.name_to_idx.find(name);
|
auto it = tensors_map.name_to_idx.find(name);
|
||||||
if (it == tensors_map.name_to_idx.end()) {
|
if (it == tensors_map.name_to_idx.end()) {
|
||||||
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
|
||||||
}
|
}
|
||||||
llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
llama_load_tensor & lt = tensors_map.tensors.at(it->second);
|
||||||
if (lt.ne != ne) {
|
if (lt.ne != ne) {
|
||||||
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
||||||
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return get_tensor_for(lt, backend);
|
return get_tensor_for(lt, backend);
|
||||||
@ -706,7 +706,7 @@ struct llama_model_loader {
|
|||||||
|
|
||||||
void done_getting_tensors() const {
|
void done_getting_tensors() const {
|
||||||
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
|
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
|
||||||
throw std::string("llama.cpp: file contained more tensors than expected");
|
throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -994,7 +994,7 @@ static void llama_model_load_internal(
|
|||||||
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
|
||||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
|
||||||
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
|
throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1002,7 +1002,7 @@ static void llama_model_load_internal(
|
|||||||
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
|
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
|
||||||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
|
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
|
||||||
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
|
throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1033,7 +1033,7 @@ static void llama_model_load_internal(
|
|||||||
|
|
||||||
model.ctx = ggml_init(params);
|
model.ctx = ggml_init(params);
|
||||||
if (!model.ctx) {
|
if (!model.ctx) {
|
||||||
throw format("ggml_init() failed");
|
throw std::runtime_error(format("ggml_init() failed"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1214,8 +1214,8 @@ static bool llama_model_load(
|
|||||||
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
|
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
|
||||||
vocab_only, progress_callback, progress_callback_user_data);
|
vocab_only, progress_callback, progress_callback_user_data);
|
||||||
return true;
|
return true;
|
||||||
} catch (const std::string & err) {
|
} catch (const std::exception & err) {
|
||||||
fprintf(stderr, "error loading model: %s\n", err.c_str());
|
fprintf(stderr, "error loading model: %s\n", err.what());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2120,6 +2120,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
||||||
|
|
||||||
// K-quants
|
// K-quants
|
||||||
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
||||||
@ -2130,7 +2131,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
||||||
default: throw format("invalid output file type %d\n", ftype);
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nthread <= 0) {
|
if (nthread <= 0) {
|
||||||
@ -2231,7 +2232,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("quantizing .. ");
|
printf("quantizing .. ");
|
||||||
@ -2433,8 +2434,8 @@ int llama_model_quantize(
|
|||||||
try {
|
try {
|
||||||
llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
|
llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
|
||||||
return 0;
|
return 0;
|
||||||
} catch (const std::string & err) {
|
} catch (const std::exception & err) {
|
||||||
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
|
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2687,8 +2688,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|||||||
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||||
try {
|
try {
|
||||||
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
||||||
} catch (const std::string & err) {
|
} catch (const std::exception & err) {
|
||||||
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
|
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user