first conv

Georgi Gerganov 2024-12-10 19:18:04 +02:00
parent e08f8a5d8c
commit ce49e6a2cd
3 changed files with 40 additions and 10 deletions

View File

@@ -170,13 +170,19 @@ int main(int argc, char ** argv) {
     const float * embd = llama_get_embeddings(ctx_cts);
+    int n = 768*261;
     LOG("result:\n");
     for (int i = 0; i < 10; ++i) {
         LOG("%8.3f ", embd[i]);
     }
     LOG("\n");
+    for (int i = n - 10; i < n; ++i) {
+        LOG("%8.3f ", embd[i]);
+    }
+    LOG("\n");
     double sum = 0.0;
-    for (int i = 0; i < 261*512; ++i) {
+    for (int i = 0; i < n; ++i) {
         sum += embd[i];
     }
     LOG("sum: %f\n", sum);

View File

@@ -3874,7 +3874,7 @@ struct ggml_tensor * ggml_im2col(
         int                   d1,
         bool                  is_2D,
         enum ggml_type        dst_type) {
-    if(is_2D) {
+    if (is_2D) {
         GGML_ASSERT(a->ne[2] == b->ne[2]);
     } else {
         GGML_ASSERT(a->ne[1] == b->ne[1]);

View File

@@ -3054,8 +3054,8 @@ struct llama_model {
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
-    // quantizer
-    struct ggml_tensor * qntz_cbook_embd = nullptr;
+    struct ggml_tensor * conv_1d   = nullptr;
+    struct ggml_tensor * conv_1d_b = nullptr;
     std::vector<llama_layer> layers;
@@ -5036,7 +5036,7 @@ struct llama_model_loader {
     void done_getting_tensors() const {
         if (n_created != n_tensors) {
-            // TODO: TEMPORARY DISABLED
+            // TODO: TEMPORARY DISABLED [OUTETTS]
             //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
         }
     }
@@ -7356,6 +7356,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_UP_EXPS,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+    {LLM_TENSOR_CONV1D,               {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
 };
 // checks if the weight tensor can be used with the specified buffer type and device
@@ -7460,6 +7461,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                 op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
             } break;
+        case GGML_OP_IM2COL:
+            {
+                int n_embd = hparams.n_embd;
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+            } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
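
Aside: LLM_TENSOR_CONV1D is mapped to GGML_OP_IM2COL, and the probe above builds a throw-away im2col node to test buffer support, because ggml expresses a 1D convolution as an im2col unfold followed by one matrix multiplication. A rough sketch of that lowering, paraphrased from memory of the ggml sources rather than taken from this commit (shapes in ggml's [ne0, ne1, ne2] order; treat the exact reshapes as an approximation):

// sketch (not part of this commit): approximate lowering of a 1D conv in ggml
#include "ggml.h"

// w: kernel  [K, IC, OC]
// x: signal  [T, IC, N]
// returns    [OL, OC, N] with OL = (T + 2*p0 - d0*(K - 1) - 1)/s0 + 1
static struct ggml_tensor * conv_1d_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * w,
        struct ggml_tensor  * x,
        int s0, int p0, int d0) {
    // unfold overlapping windows of x into columns: [K*IC, OL, N]
    struct ggml_tensor * cols = ggml_im2col(ctx, w, x, s0, 0, p0, 0, d0, 0, /*is_2D=*/false, GGML_TYPE_F16);

    // a single matmul over all windows: [K*IC, OL*N] x [K*IC, OC] -> [OL*N, OC]
    struct ggml_tensor * out = ggml_mul_mat(ctx,
            ggml_reshape_2d(ctx, cols, cols->ne[0], cols->ne[1]*cols->ne[2]),
            ggml_reshape_2d(ctx, w,    w->ne[0]*w->ne[1], w->ne[2]));

    // split OL and N back apart: [OL, OC, N]
    return ggml_reshape_3d(ctx, out, cols->ne[1], w->ne[2], cols->ne[2]);
}
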
@@ -9428,6 +9435,9 @@ static bool llm_load_tensors(
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    model.conv_1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0);
+                    model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {768}, 0);
                     // output
                     model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
                     model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -9671,7 +9681,7 @@ static struct ggml_tensor * llm_build_inp_embd(
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -17009,7 +17019,13 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
-        cur = inpL;
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
+        printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);
+        printf("conv1d: %d %d %d\n", model.conv_1d->ne[0], model.conv_1d->ne[1], model.conv_1d->ne[2]);
+        cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1);
+        cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0]));
+        printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);
         //cur = llm_build_norm(ctx0, cur, hparams,
         //        model.output_norm, NULL,
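
Aside: the new graph fragment above (transpose, ggml_conv_1d_ph, bias add) as a stand-alone sketch. The sizes are assumptions for illustration: n_embd = 512 is made up, while 7/768 match the conv_1d tensor created earlier and T = 261 matches the test input; it also assumes the post-refactor CPU backend header (ggml-cpu.h). The point is that the conv runs along ne0, so the embeddings are transposed to put time first, and the bias is reshaped to [1, 768] so ggml_add broadcasts it across all T positions.

// sketch (not part of this commit): the conv fragment in isolation
#include "ggml.h"
#include "ggml-cpu.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 128u*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 512; // assumption for the sketch; the model uses hparams.n_embd
    const int T      = 261; // number of tokens/frames in the test run

    struct ggml_tensor * inpL   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, T); // [n_embd, T]
    struct ggml_tensor * conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 7, n_embd, 768);
    struct ggml_tensor * conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 768);

    // ggml_conv_1d_ph convolves along ne0, so put time on ne0 first: [T, n_embd]
    struct ggml_tensor * cur = ggml_cont(ctx, ggml_transpose(ctx, inpL));

    // "same"-padded conv: [T, n_embd] -> [T, 768]
    cur = ggml_conv_1d_ph(ctx, conv_w, cur, 1, 1);

    // reshape the bias to [1, 768] so ggml_add broadcasts it over the T positions
    cur = ggml_add(ctx, cur, ggml_reshape_2d(ctx, conv_b, 1, 768));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, cur);
    ggml_graph_compute_with_ctx(ctx, gf, 4); // tensor data is uninitialized; only the shapes matter here

    printf("out: [%lld, %lld]\n", (long long) cur->ne[0], (long long) cur->ne[1]); // expect [261, 768]

    ggml_free(ctx);
    return 0;
}
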
@@ -17309,7 +17325,7 @@ static struct ggml_cgraph * llama_build_graph(
     // add on pooling layer
     if (lctx.cparams.embeddings) {
-        // TODO: TEMPORARY DISABLED
+        // TODO: TEMPORARY DISABLED [OUTETTS]
         //result = llm.append_pooling(result);
     }
@@ -17798,7 +17814,13 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     }
     const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
+    // TODO: TEMPORARY !!! [OUTETTS]
+#if 0
     const size_t new_size  = (logits_size + embd_size) * sizeof(float);
+#else
+    const size_t new_size  = 1024*1024*32;
+#endif
     // alloc only when more than the current capacity is required
     // TODO: also consider shrinking the buffer
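
Aside, for scale (sizes taken from this diff): the decode path below copies n_tokens*768 floats, so the 261-frame test input needs 261*768*4 = 801,792 bytes (about 0.77 MiB), while the hard-coded 32 MiB buffer has room for roughly 10,922 such frames. Per the TODO the fixed size is a stop-gap, presumably until logits_size/embd_size are computed to account for the conv output.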
@@ -18075,7 +18097,7 @@ static int llama_decode_internal(
     struct ggml_tensor * res  = nullptr;
     struct ggml_tensor * embd = nullptr;
-// TODO: TEMPORARY DISABLED
+// TODO: TEMPORARY DISABLED [OUTETTS]
 if (model.arch != LLM_ARCH_OUTETTS_VOC) {
     // the output is always the last tensor in the graph
     res  = ggml_graph_node(gf, -1);
@@ -18170,7 +18192,9 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) {
                     if (n_outputs_new) {
                         GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
                         GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
-                        ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
+                        // TODO: TEMPORARY [OUTETTS]
+                        //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
+                        ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*768*sizeof(float));
                     }
                 } break;
             case LLAMA_POOLING_TYPE_MEAN: