sync master

This commit is contained in:
caitianchi 2024-05-29 03:06:58 +08:00
parent 28d4a7f9cc
commit 02eb445d73

View File

@ -1768,17 +1768,24 @@ static llama_state g_state;
// available llama models // available llama models
enum e_model { enum e_model {
MODEL_UNKNOWN, MODEL_UNKNOWN,
MODEL_14M,
MODEL_17M, MODEL_17M,
MODEL_22M, MODEL_22M,
MODEL_33M, MODEL_33M,
MODEL_70M,
MODEL_109M, MODEL_109M,
MODEL_137M, MODEL_137M,
MODEL_160M,
MODEL_335M, MODEL_335M,
MODEL_410M,
MODEL_0_5B, MODEL_0_5B,
MODEL_1B, MODEL_1B,
MODEL_1_4B,
MODEL_2B, MODEL_2B,
MODEL_2_8B,
MODEL_3B, MODEL_3B,
MODEL_4B, MODEL_4B,
MODEL_6_9B,
MODEL_7B, MODEL_7B,
MODEL_8B, MODEL_8B,
MODEL_12B, MODEL_12B,
@ -1813,6 +1820,7 @@ static const size_t GiB = 1024*MiB;
struct llama_hparams { struct llama_hparams {
bool vocab_only; bool vocab_only;
bool rope_finetuned; bool rope_finetuned;
bool use_par_res;
uint32_t n_vocab; uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on uint32_t n_ctx_train; // context size the model was trained on
@ -2578,7 +2586,6 @@ static bool llama_kv_cache_init(
static bool llama_kv_cache_find_slot( static bool llama_kv_cache_find_slot(
struct llama_kv_cache & cache, struct llama_kv_cache & cache,
const struct llama_batch & batch) { const struct llama_batch & batch) {
const uint32_t n_ctx = cache.size;
const uint32_t n_tokens = batch.n_tokens; const uint32_t n_tokens = batch.n_tokens;
if (cache.recurrent) { if (cache.recurrent) {
@ -2629,16 +2636,16 @@ static bool llama_kv_cache_find_slot(
} }
// otherwise, one cell per token. // otherwise, one cell per token.
if (n_tokens > n_ctx) { if (n_tokens > cache.size) {
LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
return false; return false;
} }
uint32_t n_tested = 0; uint32_t n_tested = 0;
while (true) { while (true) {
if (cache.head + n_tokens > n_ctx) { if (cache.head + n_tokens > cache.size) {
n_tested += n_ctx - cache.head; n_tested += cache.size - cache.head;
cache.head = 0; cache.head = 0;
continue; continue;
} }
@ -2657,7 +2664,7 @@ static bool llama_kv_cache_find_slot(
break; break;
} }
if (n_tested >= n_ctx) { if (n_tested >= cache.size) {
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
return false; return false;
} }
@ -5176,28 +5183,19 @@ static bool llm_load_tensors(
case LLM_ARCH_MINICPM: case LLM_ARCH_MINICPM:
{ {
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
// if output is NULL, init from the input tok embed // output
if (model.output == NULL) { {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
ml.n_created--; // artificial tensor if (model.arch != LLM_ARCH_MINICPM){
ml.size_data += ggml_nbytes(model.output); model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
}
} }
// // output
// {
// model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
// if (model.arch != LLM_ARCH_MINICPM){
// model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// // if output is NULL, init from the input tok embed
// if (model.output == NULL) {
// model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
// }
// }
// }
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i); ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i); ggml_context * ctx_split = ctx_for_layer_split(i);
@ -10161,9 +10159,7 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// scale the input embeddings // scale the input embeddings
if (batch.token) { inpL = ggml_scale(ctx0, inpL, scale_embd);
inpL = ggml_scale(ctx0, inpL, scale_embd);
}
cb(inpL, "inp_scaled", -1); cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions // inp_pos - contains the positions
@ -10279,8 +10275,7 @@ struct llm_build_context {
cb(cur, "lmhead_scaling", -1); cb(cur, "lmhead_scaling", -1);
// lm_head // lm_head
// cur = ggml_mul_mat(ctx0, model.tok_embd, cur); cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1); cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
@ -16236,33 +16231,16 @@ struct llama_model * llama_load_model_from_file(
} }
model->rpc_servers.push_back(servers); model->rpc_servers.push_back(servers);
} }
// int status = llama_model_load(path_model, *model, params); int status = llama_model_load(path_model, *model, params);
// GGML_ASSERT(status <= 0); GGML_ASSERT(status <= 0);
// if (status < 0) { if (status < 0) {
// if (status == -1) { if (status == -1) {
// LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
// } else if (status == -2) { } else if (status == -2) {
// LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
// }
// delete model;
// return nullptr;
// }
try {
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
} else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
delete model;
return nullptr;
} }
} catch (...) {
LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
delete model; delete model;
throw; return nullptr;
} }
return model; return model;
@ -16649,7 +16627,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// these models do not use RoPE // these models do not use RoPE
case LLM_ARCH_GPT2: case LLM_ARCH_GPT2:
case LLM_ARCH_GPTJ: case LLM_ARCH_GPTJ:
case LLM_ARCH_GPTNEOX:
case LLM_ARCH_MPT: case LLM_ARCH_MPT:
case LLM_ARCH_REFACT: case LLM_ARCH_REFACT:
case LLM_ARCH_BLOOM: case LLM_ARCH_BLOOM:
@ -16687,6 +16664,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI3: case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA:
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX; return LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here // all model arches should be listed explicitly here