mirror of https://github.com/ggerganov/llama.cpp.git
fix n_embd + remove llama.cpp hacks

commit 86d0ad5ef4
parent dcf2230afb
@@ -152,7 +152,7 @@ config = {
     "architectures": [
         "OuteTTSVocoder"
     ],
-    "hidden_size": 512,
+    "hidden_size": 1282,
     "vocab_size": 4096,
     "n_head": 1,
     "layer_norm_epsilon": 1e-6,
@@ -168,9 +168,10 @@ int main(int argc, char ** argv) {

     LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

+    const int n_embd = llama_n_embd(model_cts);
     const float * embd = llama_get_embeddings(ctx_cts);

-    int n = 1282*261;
+    int n = n_embd*261;

     LOG("result:\n");
     for (int i = 0; i < 10; ++i) {
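With the hard-coded 1282 replaced by `llama_n_embd(model_cts)`, the example sizes its read of the embedding buffer from the model itself. A minimal sketch of how that buffer could then be walked frame by frame; the 261-frame count comes from the old `1282*261` sizing, and everything not visible in the hunk above is illustrative only:

    // Sketch only: read the vocoder output as n_frames rows of n_embd floats.
    const int     n_embd   = llama_n_embd(model_cts);        // 1282 for this vocoder
    const int     n_frames = 261;                            // from the old 1282*261 sizing
    const float * embd     = llama_get_embeddings(ctx_cts);  // n_embd * n_frames values

    for (int f = 0; f < n_frames; ++f) {
        const float * frame = embd + (size_t) f * n_embd;    // one spectrogram frame
        // ... hand `frame` to the spectrogram-to-audio stage ...
    }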
@@ -9539,12 +9539,12 @@ static bool llm_load_tensors(
                    } break;
                case LLM_ARCH_OUTETTS_VOC:
                    {
-                       model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                       model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0);

                        model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0);
                        model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0);

-                       model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0);
+                       model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, 512, 768}, 0);
                        model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0);

                        model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0);
@@ -9636,8 +9636,8 @@ static bool llm_load_tensors(
                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0);

-                       model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, 0);
-                       model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, 0);
+                       model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0);
+                       model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);

                        model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0);
                    } break;
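These shapes suggest why 1282 is now the model-level n_embd: with a hann_window of length 1280, an FFT size of 1280 gives 1280/2 + 1 = 641 frequency bins, and two values per bin (e.g. magnitude and phase) give 2 * 641 = 1282 output channels, while the token embedding and conv stack keep their native 512/768 widths. That relationship is an assumption on my part, not something the commit states; a quick sanity check of the arithmetic:

    // Assumed relationship (not asserted by the commit): output width = n_fft + 2,
    // i.e. two values per rFFT bin of the 1280-sample hann_window.
    constexpr int n_fft  = 1280;
    constexpr int n_bins = n_fft/2 + 1;   // 641
    constexpr int n_out  = 2*n_bins;      // 1282
    static_assert(n_out == 1282, "matches the new hidden_size / n_embd");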
@@ -17432,14 +17432,12 @@ struct llm_build_context {
                model.output_norm,
                model.output_norm_b,
                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);

        // lm_head
        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-        cb(cur, "result_output_no_bias", -1);

        cur = ggml_add(ctx0, cur, model.output_b);
-        cb(cur, "result_output", -1);
+        cb(cur, "result_embd", -1);

        printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);

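Tagging the final tensor as "result_embd" instead of "result_output" marks the vocoder graph as ending in embeddings rather than logits. The cb callback essentially names the tensor, so the change amounts to roughly the following (a paraphrase, not the literal callback implementation):

        // What the renamed callback amounts to, approximately:
        ggml_set_name(cur, "result_embd");   // was: "result_output"
        // llama_decode_internal() later locates logits/embeddings by these
        // names, so the vocoder's output is now picked up as an embedding.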
@@ -17732,8 +17730,7 @@ static struct ggml_cgraph * llama_build_graph(

    // add on pooling layer
    if (lctx.cparams.embeddings) {
-       // TODO: TEMPORARY DISABLED [OUTETTS]
-       //result = llm.append_pooling(result);
+       result = llm.append_pooling(result);
    }

    llm.free();
@@ -18221,13 +18218,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    }

    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
-
-   // TODO: TEMPORARY !!! [OUTETTS]
-#if 0
    const size_t new_size = (logits_size + embd_size) * sizeof(float);
-#else
-   const size_t new_size = 1024*1024*32;
-#endif

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
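Dropping the #if 0 hack restores the real sizing formula. For the numbers that appear elsewhere in this commit (an embedding width of 1282 and 261 output frames), the properly computed buffer is far smaller than the 32 MiB the hack reserved; a rough check under those assumptions:

    // Rough check (assumes a logits-free vocoder context and 261 output rows).
    const size_t logits_size = 0;
    const size_t embd_size   = 1282ull*261;                         // n_embd * n_outputs
    const size_t new_size    = (logits_size + embd_size)*sizeof(float);
    // new_size is ~1.3 MB, versus the fixed 1024*1024*32 (32 MiB) the hack allocated.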
@@ -18501,14 +18492,9 @@ static int llama_decode_internal(

        ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

-       struct ggml_tensor * res = nullptr;
-       struct ggml_tensor * embd = nullptr;
-
-       // TODO: TEMPORARY DISABLED [OUTETTS]
-if (model.arch != LLM_ARCH_OUTETTS_VOC) {
        // the output is always the last tensor in the graph
-       res = ggml_graph_node(gf, -1);
-       embd = ggml_graph_node(gf, -2);
+       struct ggml_tensor * res = ggml_graph_node(gf, -1);
+       struct ggml_tensor * embd = ggml_graph_node(gf, -2);

        if (lctx.n_outputs == 0) {
            // no output
@@ -18528,10 +18514,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) {
            embd = nullptr; // do not extract embeddings when not needed
            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
        }
-} else {
-       res = nullptr;
-       embd = ggml_graph_node(gf, -1);
-}

        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

        ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -18599,9 +18582,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) {
                        if (n_outputs_new) {
                            GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
                            GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
-                           // TODO: TEMPORARY [OUTETTS]
-                           //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
-                           ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*1282*sizeof(float));
+                           ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                        }
                    } break;
            case LLAMA_POOLING_TYPE_MEAN:
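The restored copy also scales with how many rows of the ubatch were actually flagged as outputs, instead of always moving n_tokens*1282 floats. An illustrative comparison with made-up batch numbers:

    // Illustrative only: a ubatch of 64 tokens, 8 of which are flagged as outputs.
    const int    n_tokens      = 64;
    const int    n_outputs_new = 8;
    const int    n_embd        = 1282;
    const size_t copied_now    = (size_t) n_outputs_new*n_embd*sizeof(float);  // ~41 KB
    const size_t copied_before = (size_t) n_tokens*1282*sizeof(float);         // ~328 KB
    // The hack always moved n_tokens*1282 floats, even when only a few rows were needed.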