mirror of https://github.com/ggerganov/llama.cpp.git

commit e08f8a5d8c
parent 81472a3716

    extract features
@@ -138,6 +138,7 @@ config = {
     ],
     "hidden_size": 512,
     "vocab_size": 4096,
+    "n_head": 1,
     "max_position_embeddings": 8192, # ?
     "num_hidden_layers": 12
 }
@@ -88,6 +88,7 @@ int main(int argc, char ** argv) {
     ctx_ttc = llama_init_ttc.context;

     params.model = params.vocoder.model;
+    params.embedding = true;

     common_init_result llama_init_cts = common_init_from_params(params);
     model_cts = llama_init_cts.model;
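Note on the hunk above: setting `params.embedding = true` before `common_init_from_params()` makes the vocoder context keep per-token embeddings, which is what the later `llama_get_embeddings()` call reads back. A minimal sketch of the same effect using the core API directly, assuming the vocoder model is already loaded (illustrative only, not part of the patch):

    // illustrative sketch, not part of the patch: enable embedding extraction via
    // the core API instead of common_init_from_params(); assumes llama.h is included
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings = true;   // keep per-token embeddings so llama_get_embeddings() can return them

    llama_context * ctx_cts = llama_new_context_with_model(model_cts, cparams);
    if (ctx_cts == nullptr) {
        LOG_ERR("%s: failed to create the vocoder context\n", __func__);
        return 1;
    }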
@@ -146,6 +147,9 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size());
     }

+    for (auto & token : prompt_inp) {
+        token -= 151672;
+    }

     llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1);

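The constant 151672 is not explained in the patch; it appears to shift the TTC model's audio-token ids down into the vocoder's codebook range, whose `vocab_size` is 4096 in the config hunk above, i.e. the ids are assumed to lie in [151672, 151672 + 4096). A self-contained sketch of the same remapping, with the offset treated as an assumption rather than a documented constant:

    #include <cstdint>
    #include <vector>

    // Hypothetical helper mirroring the in-place loop above: shift the TTC model's
    // audio-token ids down to raw codebook indices for the vocoder. The base id
    // 151672 is copied from the patch; treating it as the first audio-code id in
    // the TTC vocabulary is an assumption.
    static std::vector<int32_t> remap_audio_tokens(std::vector<int32_t> tokens,
                                                   int32_t first_audio_id = 151672) {
        for (auto & t : tokens) {
            t -= first_audio_id; // expected to land in [0, 4096), the vocoder's vocab range
        }
        return tokens;
    }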
@@ -155,22 +159,27 @@ int main(int argc, char ** argv) {
     }
     GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());

-    if (llama_decode(ctx_ttc, batch) != 0) {
+    if (llama_decode(ctx_cts, batch) != 0) {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }

-    llama_synchronize(ctx_ttc);
+    llama_synchronize(ctx_cts);

     LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

-    const float * embd = llama_get_embeddings(ctx_ttc);
+    const float * embd = llama_get_embeddings(ctx_cts);

     LOG("result:\n");
     for (int i = 0; i < 10; ++i) {
         LOG("%8.3f ", embd[i]);
     }
     LOG("\n");
+    double sum = 0.0;
+    for (int i = 0; i < 261*512; ++i) {
+        sum += embd[i];
+    }
+    LOG("sum: %f\n", sum);

     fprintf(stderr, "\n");

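The added loop prints a quick checksum of the extracted features; 261*512 looks like n_tokens times hidden_size for this particular run (512 is the `hidden_size` from the config hunk, 261 is presumably the decoded prompt length). A sketch of the same check sized from the model instead of hard-coded constants, reusing the `embd`, `model_cts` and `prompt_inp` names from the hunks above and assuming the flat n_tokens x n_embd layout the patch itself relies on:

    // sketch, not part of the patch: the same sanity checksum, sized from the model
    // instead of the hard-coded 261*512
    const int n_embd   = llama_n_embd(model_cts);   // 512 for this vocoder config
    const int n_tokens = (int) prompt_inp.size();   // 261 in the run above (assumption)

    double sum = 0.0;
    for (int i = 0; i < n_tokens * n_embd; ++i) {
        sum += embd[i];
    }
    LOG("sum: %f\n", sum);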
src/llama.cpp
@@ -3614,7 +3614,9 @@ static bool llama_kv_cache_init(

     const struct llama_hparams & hparams = model.hparams;

-    const int64_t n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer;

+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
+
     cache.has_shift = false;

@@ -3655,10 +3657,12 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);

-    for (int i = 0; i < (int) n_layer; i++) {
+    for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
         ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer.at(i).dev;
@@ -5032,7 +5036,8 @@ struct llama_model_loader {

    void done_getting_tensors() const {
        if (n_created != n_tensors) {
-            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+            // TODO: TEMPORARY DISABLED
+            //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
        }
    }

@@ -9422,6 +9427,10 @@ static bool llm_load_tensors(
        case LLM_ARCH_OUTETTS_VOC:
            {
                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
+                model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
            } break;
        default:
            throw std::runtime_error("unknown architecture");
@@ -16991,6 +17000,30 @@ struct llm_build_context {

        return gf;
    }
+
+    struct ggml_cgraph * build_outetts_voc() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = inpL;
+
+        //cur = llm_build_norm(ctx0, cur, hparams,
+        //        model.output_norm, NULL,
+        //        LLM_NORM_RMS, cb, -1);
+        //cb(cur, "result_norm", -1);
+
+        //// lm_head
+        //cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        //cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
};

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
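The new `build_outetts_voc()` graph stops at the raw token embeddings: the RMS norm and lm_head are commented out, so the tensor expanded into the graph is just the embedding-table lookup for the batch. In stand-alone ggml terms, and within the builder's own context (`ctx0`, `gf`, `model`, `n_tokens`), the graph amounts to the following sketch; `llm_build_inp_embd()` performs this row lookup, plus callback bookkeeping, internally:

    // illustrative sketch of what the vocoder graph currently computes:
    // a plain embedding-table lookup, expanded as the graph output
    struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); // token ids of the batch
    struct ggml_tensor * cur        = ggml_get_rows(ctx0, model.tok_embd, inp_tokens);   // [n_embd, n_tokens] features

    ggml_build_forward_expand(gf, cur);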
@@ -17266,13 +17299,18 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_chameleon();
            } break;
+        case LLM_ARCH_OUTETTS_VOC:
+            {
+                result = llm.build_outetts_voc();
+            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    if (lctx.cparams.embeddings) {
-        result = llm.append_pooling(result);
+        // TODO: TEMPORARY DISABLED
+        //result = llm.append_pooling(result);
    }

    llm.free();
@@ -17357,30 +17395,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
    }

    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = ubatch.n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
-        int32_t * data = (int32_t *) lctx.inp_out_ids->data;
-
-        if (lctx.n_outputs == n_tokens) {
-            for (int i = 0; i < n_tokens; ++i) {
-                data[i] = i;
-            }
-        } else if (ubatch.output) {
-            int32_t n_outputs = 0;
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
-                }
-            }
-            // the graph needs to have been passed the correct number of outputs
-            GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
-        } else {
-            GGML_ASSERT(lctx.n_outputs == 0);
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch.n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
+            }
        }
    }

@@ -18029,9 +18072,14 @@ static int llama_decode_internal(

    ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

+    struct ggml_tensor * res = nullptr;
+    struct ggml_tensor * embd = nullptr;
+
+    // TODO: TEMPORARY DISABLED
+    if (model.arch != LLM_ARCH_OUTETTS_VOC) {
    // the output is always the last tensor in the graph
-    struct ggml_tensor * res = ggml_graph_node(gf, -1);
-    struct ggml_tensor * embd = ggml_graph_node(gf, -2);
+    res = ggml_graph_node(gf, -1);
+    embd = ggml_graph_node(gf, -2);

    if (lctx.n_outputs == 0) {
        // no output
@@ -18051,6 +18099,10 @@ static int llama_decode_internal(
        embd = nullptr; // do not extract embeddings when not needed
        GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
    }
+    } else {
+        res = nullptr;
+        embd = ggml_graph_node(gf, -1);
+    }
    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

    ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
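With these changes, the vocoder graph ends in the raw feature tensor and `llama_decode_internal` takes the last graph node as the embedding output for LLM_ARCH_OUTETTS_VOC, so the context exposes one feature vector per input token through `llama_get_embeddings()`. A usage sketch on the example side, reusing the `ctx_cts`, `model_cts` and `prompt_inp` names from the tts hunks above (the flat row-per-token layout matches how the patch itself indexes `embd`):

    // usage sketch (assumption: run after the successful llama_decode() on ctx_cts above)
    const float * feats  = llama_get_embeddings(ctx_cts);  // n_tokens * n_embd floats, one row per token
    const int     n_embd = llama_n_embd(model_cts);

    for (int i = 0; i < (int) prompt_inp.size(); ++i) {
        const float * row = feats + (size_t) i * n_embd;
        LOG("feat[%3d][0] = %8.3f\n", i, row[0]);
    }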