Repository: https://github.com/ggerganov/llama.cpp.git (mirror)

commit 02eb445d73 ("sync master")
parent 28d4a7f9cc

llama.cpp: 90 changed lines
@@ -1768,17 +1768,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_14M,
     MODEL_17M,
     MODEL_22M,
     MODEL_33M,
+    MODEL_70M,
     MODEL_109M,
     MODEL_137M,
+    MODEL_160M,
     MODEL_335M,
+    MODEL_410M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_4B,
     MODEL_2B,
+    MODEL_2_8B,
     MODEL_3B,
     MODEL_4B,
+    MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
     MODEL_12B,
@@ -1813,6 +1820,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
+    bool use_par_res;
 
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
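Note: the new use_par_res hparam records whether a GPT-NeoX family checkpoint uses the parallel residual layout (attention and feed-forward applied to the same input and summed) or the classic sequential layout. A minimal standalone sketch of the two layouts, with layer norms omitted and attn/mlp as placeholder functions rather than llama.cpp APIs:

#include <cstddef>
#include <vector>

// Stand-ins for the real attention and feed-forward blocks.
static std::vector<float> attn(const std::vector<float> & x) { return x; }
static std::vector<float> mlp (const std::vector<float> & x) { return x; }

// One transformer block, switching between the two residual layouts
// selected by a use_par_res-style flag.
std::vector<float> block(const std::vector<float> & x, bool use_par_res) {
    std::vector<float> out(x.size());
    if (use_par_res) {
        // parallel residual: out = x + attn(x) + mlp(x)
        const std::vector<float> a = attn(x);
        const std::vector<float> m = mlp(x);
        for (size_t i = 0; i < x.size(); ++i) {
            out[i] = x[i] + a[i] + m[i];
        }
    } else {
        // sequential residual: h = x + attn(x); out = h + mlp(h)
        const std::vector<float> a = attn(x);
        std::vector<float> h(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            h[i] = x[i] + a[i];
        }
        const std::vector<float> m = mlp(h);
        for (size_t i = 0; i < x.size(); ++i) {
            out[i] = h[i] + m[i];
        }
    }
    return out;
}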
@@ -2578,7 +2586,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
-    const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
     if (cache.recurrent) {
@@ -2629,16 +2636,16 @@ static bool llama_kv_cache_find_slot(
     }
     // otherwise, one cell per token.
 
-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
         return false;
     }
 
     uint32_t n_tested = 0;
 
     while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
             cache.head = 0;
             continue;
         }
@@ -2657,7 +2664,7 @@ static bool llama_kv_cache_find_slot(
             break;
         }
 
-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
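Note: the three hunks above only drop the local n_ctx alias in favour of reading cache.size directly; the slot-search algorithm itself is unchanged. For reference, a self-contained sketch of the same ring-buffer scan over a plain occupancy vector (illustrative names, not the llama.cpp structs):

#include <cstdint>
#include <vector>

// Find a run of n_tokens consecutive free cells in a ring buffer of
// occupied.size() cells, starting the scan at `head` and wrapping once.
// Returns the start index, or -1 if no contiguous run exists.
int64_t find_slot(const std::vector<bool> & occupied, uint32_t head, uint32_t n_tokens) {
    const uint32_t size = (uint32_t) occupied.size();
    if (n_tokens > size) {
        return -1; // the request can never fit
    }

    uint32_t n_tested = 0;
    while (true) {
        if (head + n_tokens > size) {
            // not enough room before the end of the buffer: wrap around
            n_tested += size - head;
            head = 0;
            continue;
        }

        // check whether the window [head, head + n_tokens) is free
        bool found = true;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (occupied[head + i]) {
                found     = false;
                head     += i + 1; // skip past the occupied cell
                n_tested += i + 1;
                break;
            }
        }
        if (found) {
            return head;
        }
        if (n_tested >= size) {
            return -1; // scanned the whole ring without success
        }
    }
}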
@@ -5176,28 +5183,19 @@ static bool llm_load_tensors(
             case LLM_ARCH_MINICPM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
 
-                    // if output is NULL, init from the input tok embed
-                    if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        if (model.arch != LLM_ARCH_MINICPM){
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                            // if output is NULL, init from the input tok embed
+                            if (model.output == NULL) {
+                                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                            }
+                        }
                     }
-
-                    // // output
-                    // {
-                    //     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    //     if (model.arch != LLM_ARCH_MINICPM){
-                    //         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    //         // if output is NULL, init from the input tok embed
-                    //         if (model.output == NULL) {
-                    //             model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-                    //         }
-                    //     }
-                    // }
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 
                     for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
                         ggml_context * ctx_split = ctx_for_layer_split(i);
@@ -10161,9 +10159,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // scale the input embeddings
-        if (batch.token) {
-            inpL = ggml_scale(ctx0, inpL, scale_embd);
-        }
+        inpL = ggml_scale(ctx0, inpL, scale_embd);
         cb(inpL, "inp_scaled", -1);
 
         // inp_pos - contains the positions
@@ -10279,8 +10275,7 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        // cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
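Note: with this change the MINICPM graph computes the output logits against the token-embedding matrix itself (tied embeddings) instead of a separate model.output tensor. Conceptually each logit is the dot product of the hidden state with one embedding row; a plain sketch outside of ggml (illustrative types and names):

#include <cstddef>
#include <vector>

// Tied lm_head: project a hidden state of size n_embd onto the vocabulary
// by multiplying with the token-embedding matrix E (n_vocab x n_embd),
// i.e. logits[v] = dot(E[v], hidden).
std::vector<float> tied_lm_head(const std::vector<std::vector<float>> & tok_embd,
                                const std::vector<float> & hidden) {
    std::vector<float> logits(tok_embd.size(), 0.0f);
    for (size_t v = 0; v < tok_embd.size(); ++v) {
        float acc = 0.0f;
        for (size_t i = 0; i < hidden.size(); ++i) {
            acc += tok_embd[v][i] * hidden[i];
        }
        logits[v] = acc;
    }
    return logits;
}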
@@ -16236,33 +16231,16 @@ struct llama_model * llama_load_model_from_file(
         }
         model->rpc_servers.push_back(servers);
     }
-    // int status = llama_model_load(path_model, *model, params);
-    // GGML_ASSERT(status <= 0);
-    // if (status < 0) {
-    //     if (status == -1) {
-    //         LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-    //     } else if (status == -2) {
-    //         LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-    //     }
-    //     delete model;
-    //     return nullptr;
-    // }
-    try {
-        int status = llama_model_load(path_model, *model, params);
-        GGML_ASSERT(status <= 0);
-        if (status < 0) {
-            if (status == -1) {
-                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
-            } else if (status == -2) {
-                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
-            }
-            delete model;
-            return nullptr;
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
-    } catch (...) {
-        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
         delete model;
-        throw;
+        return nullptr;
     }
 
     return model;
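Note: this restores the upstream status-code path in llama_load_model_from_file and drops the fork's try/catch wrapper; on failure the partially constructed model is deleted and nullptr is returned to the caller. A compact standalone sketch of that pattern (placeholder model_t and model_load, not the real llama.cpp types):

#include <cstdio>

struct model_t { /* ... */ };

// Stand-in loader: returns 0 on success, -1 on failure, -2 if cancelled.
static int model_load(const char * path, model_t & /*model*/) {
    return path ? 0 : -1;
}

// Factory mirroring the status-code pattern: on any negative status the
// half-initialized object is freed here and nullptr is returned.
model_t * load_model_from_file(const char * path) {
    model_t * model = new model_t();

    int status = model_load(path, *model);
    if (status < 0) {
        if (status == -1) {
            fprintf(stderr, "failed to load model\n");
        } else if (status == -2) {
            fprintf(stderr, "cancelled model load\n");
        }
        delete model; // do not leak the partially constructed object
        return nullptr;
    }
    return model;
}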
@@ -16649,7 +16627,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         // these models do not use RoPE
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
@@ -16687,6 +16664,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
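Note: LLM_ARCH_GPTNEOX moves from the "no RoPE" group to LLAMA_ROPE_TYPE_NEOX. The NEOX variant rotates dimension i together with dimension i + n_rot/2, whereas the original layout rotates adjacent pairs (2i, 2i+1). A standalone sketch of the two pairings (illustrative, not the ggml rope kernel):

#include <cmath>
#include <vector>

// Apply rotary position embedding to one head vector x of even size n_rot.
// neox_style=true pairs dimension i with i + n_rot/2 (GPT-NeoX layout);
// neox_style=false pairs adjacent dimensions 2i and 2i+1 (original layout).
void apply_rope(std::vector<float> & x, int pos, float freq_base, bool neox_style) {
    const int n_rot = (int) x.size();
    for (int i = 0; i < n_rot / 2; ++i) {
        const float theta = pos * std::pow(freq_base, -2.0f * i / n_rot);
        const float c = std::cos(theta);
        const float s = std::sin(theta);

        const int i0 = neox_style ? i             : 2 * i;
        const int i1 = neox_style ? i + n_rot / 2 : 2 * i + 1;

        const float x0 = x[i0];
        const float x1 = x[i1];
        x[i0] = x0 * c - x1 * s;
        x[i1] = x0 * s + x1 * c;
    }
}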