mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 10:54:36 +00:00
llama : simplify Mamba with advanced batch splits (#8526)
* llama : advanced batch splits This includes equal-sequence-length batch splits which are useful to simplify recurrent model operators. * llama : always make recurrent state slots contiguous * ggml : simplify mamba operators * llama : fix integer signedness mixing * llama : logits_all has priority over batch->logits Otherwise, the server embeddings tests failed. This was likely an existing problem but was only detected here because of an additional assertion. * llama : apply suggestions Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * llama : fix t5 segfault * llama : fix Mamba session save and restore * llama : minor cosmetic changes * llama : rename llama_reorder_outputs to llama_output_reorder Also move it closer to llama_output_reserve. * llama : fix pooled embeddings when using batches with equal_seqs * minor : add struct members for clarity ggml-ci * llama : fix T5 segfault again * llama : fix Mamba pooled embeddings with multiple sequences Until the pooled embeddings are refactored to allow splitting across ubatches for causal embeddings, recurrent models can only process a single sequence per ubatch when calculating pooled embeddings. * llama : add llama_model_is_recurrent to simplify figuring that out This will make it easier to more cleanly support RWKV-v6 and Mamba-2. * llama : fix simple splits when the batch contains embeddings --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
fc54ef0d1c
commit
a1631e53f6
@ -1777,10 +1777,8 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * s,
|
struct ggml_tensor * sx,
|
||||||
struct ggml_tensor * x,
|
struct ggml_tensor * c);
|
||||||
struct ggml_tensor * c,
|
|
||||||
struct ggml_tensor * sq);
|
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@ -1789,8 +1787,7 @@ extern "C" {
|
|||||||
struct ggml_tensor * dt,
|
struct ggml_tensor * dt,
|
||||||
struct ggml_tensor * A,
|
struct ggml_tensor * A,
|
||||||
struct ggml_tensor * B,
|
struct ggml_tensor * B,
|
||||||
struct ggml_tensor * C,
|
struct ggml_tensor * C);
|
||||||
struct ggml_tensor * sq);
|
|
||||||
|
|
||||||
// partition into non-overlapping windows with padding if needed
|
// partition into non-overlapping windows with padding if needed
|
||||||
// example:
|
// example:
|
||||||
|
273
ggml/src/ggml.c
273
ggml/src/ggml.c
@ -7229,43 +7229,34 @@ struct ggml_tensor * ggml_flash_attn_back(
|
|||||||
|
|
||||||
struct ggml_tensor * ggml_ssm_conv(
|
struct ggml_tensor * ggml_ssm_conv(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * s,
|
struct ggml_tensor * sx,
|
||||||
struct ggml_tensor * x,
|
struct ggml_tensor * c) {
|
||||||
struct ggml_tensor * c,
|
GGML_ASSERT(ggml_is_3d(sx));
|
||||||
struct ggml_tensor * sq) {
|
|
||||||
GGML_ASSERT(ggml_is_3d(s));
|
|
||||||
GGML_ASSERT(ggml_is_matrix(x));
|
|
||||||
GGML_ASSERT(ggml_is_matrix(c));
|
GGML_ASSERT(ggml_is_matrix(c));
|
||||||
GGML_ASSERT(ggml_is_matrix(sq));
|
|
||||||
GGML_ASSERT(sq->type == GGML_TYPE_I32);
|
|
||||||
|
|
||||||
const int64_t d_conv = c->ne[0];
|
const int64_t d_conv = c->ne[0];
|
||||||
const int64_t d_inner = c->ne[1];
|
const int64_t d_inner = c->ne[1];
|
||||||
const int64_t n_tokens = x->ne[1];
|
const int64_t n_t = sx->ne[0] - d_conv + 1; // tokens per sequence
|
||||||
const int64_t n_kv = s->ne[2];
|
const int64_t n_s = sx->ne[2];
|
||||||
|
|
||||||
GGML_ASSERT( s->ne[0] == d_conv - 1);
|
// TODO: maybe support other strides than 1?
|
||||||
GGML_ASSERT( s->ne[1] == d_inner);
|
GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
|
||||||
GGML_ASSERT( x->ne[0] == d_inner);
|
GGML_ASSERT(sx->ne[1] == d_inner);
|
||||||
GGML_ASSERT(sq->ne[0] == n_kv);
|
GGML_ASSERT(n_t >= 0);
|
||||||
GGML_ASSERT(sq->ne[1] == n_tokens);
|
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
if (s->grad || x->grad || c->grad || sq->grad) {
|
if (sx->grad || c->grad) {
|
||||||
GGML_ABORT("fatal error"); // TODO: implement
|
GGML_ABORT("fatal error"); // TODO: implement
|
||||||
is_node = true;
|
is_node = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
|
struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s);
|
||||||
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
|
|
||||||
|
|
||||||
result->op = GGML_OP_SSM_CONV;
|
result->op = GGML_OP_SSM_CONV;
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
result->src[0] = s;
|
result->src[0] = sx;
|
||||||
result->src[1] = x;
|
result->src[1] = c;
|
||||||
result->src[2] = c;
|
|
||||||
result->src[3] = sq;
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -7279,39 +7270,42 @@ struct ggml_tensor * ggml_ssm_scan(
|
|||||||
struct ggml_tensor * dt,
|
struct ggml_tensor * dt,
|
||||||
struct ggml_tensor * A,
|
struct ggml_tensor * A,
|
||||||
struct ggml_tensor * B,
|
struct ggml_tensor * B,
|
||||||
struct ggml_tensor * C,
|
struct ggml_tensor * C) {
|
||||||
struct ggml_tensor * sq) {
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(s));
|
GGML_ASSERT(ggml_is_contiguous(s));
|
||||||
GGML_ASSERT(ggml_is_contiguous(x));
|
GGML_ASSERT(ggml_is_contiguous(x));
|
||||||
GGML_ASSERT(ggml_is_contiguous(dt));
|
GGML_ASSERT(ggml_is_contiguous(dt));
|
||||||
GGML_ASSERT(ggml_is_contiguous(A));
|
GGML_ASSERT(ggml_is_contiguous(A));
|
||||||
GGML_ASSERT(sq->type == GGML_TYPE_I32);
|
GGML_ASSERT(ggml_is_matrix(A));
|
||||||
|
GGML_ASSERT(ggml_is_3d(B));
|
||||||
|
GGML_ASSERT(ggml_is_3d(s));
|
||||||
GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
|
GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
|
||||||
GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
|
GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
|
||||||
GGML_ASSERT(ggml_are_same_shape(x, dt));
|
GGML_ASSERT(ggml_are_same_shape(x, dt));
|
||||||
|
GGML_ASSERT(ggml_are_same_shape(B, C));
|
||||||
|
|
||||||
{
|
{
|
||||||
const int64_t d_state = s->ne[0];
|
const int64_t d_state = s->ne[0];
|
||||||
const int64_t d_inner = s->ne[1];
|
const int64_t d_inner = s->ne[1];
|
||||||
const int64_t n_tokens = x->ne[1];
|
const int64_t n_seq_tokens = x->ne[1];
|
||||||
|
const int64_t n_seqs = x->ne[2];
|
||||||
|
|
||||||
|
GGML_ASSERT(s->ne[2] == n_seqs);
|
||||||
GGML_ASSERT(x->ne[0] == d_inner);
|
GGML_ASSERT(x->ne[0] == d_inner);
|
||||||
GGML_ASSERT(A->ne[0] == d_state);
|
GGML_ASSERT(A->ne[0] == d_state);
|
||||||
GGML_ASSERT(A->ne[1] == d_inner);
|
GGML_ASSERT(A->ne[1] == d_inner);
|
||||||
GGML_ASSERT(B->ne[0] == d_state);
|
GGML_ASSERT(B->ne[0] == d_state);
|
||||||
GGML_ASSERT(B->ne[1] == n_tokens);
|
GGML_ASSERT(B->ne[1] == n_seq_tokens);
|
||||||
GGML_ASSERT(C->ne[0] == d_state);
|
GGML_ASSERT(B->ne[2] == n_seqs);
|
||||||
GGML_ASSERT(C->ne[1] == n_tokens);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_node = false;
|
bool is_node = false;
|
||||||
|
|
||||||
if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
|
if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad) {
|
||||||
GGML_ABORT("fatal error"); // TODO: implement
|
GGML_ABORT("fatal error"); // TODO: implement
|
||||||
is_node = true;
|
is_node = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
|
// concatenated y + ssm_states
|
||||||
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
|
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
|
||||||
|
|
||||||
result->op = GGML_OP_SSM_SCAN;
|
result->op = GGML_OP_SSM_SCAN;
|
||||||
@ -7322,7 +7316,6 @@ struct ggml_tensor * ggml_ssm_scan(
|
|||||||
result->src[3] = A;
|
result->src[3] = A;
|
||||||
result->src[4] = B;
|
result->src[4] = B;
|
||||||
result->src[5] = C;
|
result->src[5] = C;
|
||||||
result->src[6] = sq;
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -10995,11 +10988,6 @@ static void ggml_compute_forward_concat_f32(
|
|||||||
|
|
||||||
GGML_TENSOR_BINARY_OP_LOCALS
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
// TODO: support for transposed / permuted tensors
|
|
||||||
GGML_ASSERT(nb0 == sizeof(float));
|
|
||||||
GGML_ASSERT(nb00 == sizeof(float));
|
|
||||||
GGML_ASSERT(nb10 == sizeof(float));
|
|
||||||
|
|
||||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||||
|
|
||||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||||
@ -15782,27 +15770,22 @@ static void ggml_compute_forward_flash_attn_back(
|
|||||||
static void ggml_compute_forward_ssm_conv_f32(
|
static void ggml_compute_forward_ssm_conv_f32(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst) {
|
||||||
const struct ggml_tensor * src0 = dst->src[0]; // conv_state
|
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
|
||||||
const struct ggml_tensor * src1 = dst->src[1]; // x
|
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
|
||||||
const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
|
|
||||||
const struct ggml_tensor * src3 = dst->src[3]; // state_seq
|
|
||||||
|
|
||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
const int nth = params->nth;
|
const int nth = params->nth;
|
||||||
|
|
||||||
const int nc = src2->ne[0]; // d_conv
|
const int nc = src1->ne[0]; // d_conv
|
||||||
const int nr = src0->ne[1]; // d_inner
|
const int ncs = src0->ne[0]; // d_conv - 1 + n_t
|
||||||
const int n_t = src1->ne[1]; // n_tokens
|
const int nr = src0->ne[1]; // d_inner
|
||||||
const int n_kv = src0->ne[2]; // max number of sequences in the batch
|
const int n_t = dst->ne[1]; // tokens per sequence
|
||||||
|
const int n_s = dst->ne[2]; // number of sequences in the batch
|
||||||
|
|
||||||
GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst));
|
GGML_ASSERT( dst->ne[0] == nr);
|
||||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||||
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
|
||||||
GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
|
|
||||||
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
||||||
// for use with the destination state offset between sequences
|
|
||||||
GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));
|
|
||||||
|
|
||||||
// rows per thread
|
// rows per thread
|
||||||
const int dr = (nr + nth - 1)/nth;
|
const int dr = (nr + nth - 1)/nth;
|
||||||
@ -15812,76 +15795,29 @@ static void ggml_compute_forward_ssm_conv_f32(
|
|||||||
const int ir1 = MIN(ir0 + dr, nr);
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
const int ir = ir1 - ir0;
|
const int ir = ir1 - ir0;
|
||||||
|
|
||||||
if (n_kv > 1) {
|
for (int i3 = 0; i3 < n_s; ++i3) {
|
||||||
// multiple sequences means it's hard to know when it's the first time a state is read,
|
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||||
// so copy them all over to the destination, just to be sure.
|
// {d_conv - 1 + n_t, d_inner, n_seqs}
|
||||||
for (int i3 = 0; i3 < n_kv; ++i3) {
|
// sliding window
|
||||||
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s}
|
||||||
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
|
const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner}
|
||||||
// can't use memcpy because of d_conv vs d_conv - 1
|
float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s}
|
||||||
|
|
||||||
|
// TODO: transpose the output for smaller strides for big batches?
|
||||||
|
// d_inner
|
||||||
for (int i1 = 0; i1 < ir; ++i1) {
|
for (int i1 = 0; i1 < ir; ++i1) {
|
||||||
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
// rowwise dot product
|
||||||
// copy s0 to last (d_conv - 1) columns of s
|
// NOTE: not using ggml_vec_dot_f32, because its sum is in double precision
|
||||||
s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
|
float sumf = 0.0f;
|
||||||
|
|
||||||
|
// d_conv
|
||||||
|
for (int i0 = 0; i0 < nc; ++i0) {
|
||||||
|
sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];
|
||||||
}
|
}
|
||||||
|
x[i1] = sumf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
|
||||||
int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
|
|
||||||
float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
|
|
||||||
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
|
|
||||||
float * s0; // {d_conv - 1, d_inner, n_kv}
|
|
||||||
float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
|
||||||
float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
|
|
||||||
int ne0s0;
|
|
||||||
|
|
||||||
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
|
||||||
|
|
||||||
// avoid needing to copy the state for the first token
|
|
||||||
if (i2 == 0) {
|
|
||||||
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
|
|
||||||
ne0s0 = src0->ne[0];
|
|
||||||
} else {
|
|
||||||
// the source is the last (d_conv - 1) columns of the destination
|
|
||||||
s0 = s + 1;
|
|
||||||
ne0s0 = nc;
|
|
||||||
}
|
|
||||||
|
|
||||||
// d_inner
|
|
||||||
for (int i1 = 0; i1 < ir; ++i1) {
|
|
||||||
// shift state left
|
|
||||||
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
|
||||||
s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
|
|
||||||
}
|
|
||||||
// insert x on the last column
|
|
||||||
s[(nc - 1) + i1*nc] = x0[i1];
|
|
||||||
}
|
|
||||||
|
|
||||||
// handle copies when there are multiple output states
|
|
||||||
for (int i3 = 1; i3 < n_kv; ++i3) {
|
|
||||||
int32_t seq = sq[i3];
|
|
||||||
if (0 <= seq && seq < n_kv) {
|
|
||||||
float * s1 = s + (seq - sq[0])*nc*nr;
|
|
||||||
memcpy(s1, s, nc*ir*sizeof(float));
|
|
||||||
} else {
|
|
||||||
// stop at negative or too big seq_ids
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// it seems a little faster when this is separate from the state shift
|
|
||||||
for (int i1 = 0; i1 < ir; ++i1) {
|
|
||||||
// rowwise dot product
|
|
||||||
float sumf = 0.0f;
|
|
||||||
for (int i0 = 0; i0 < nc; ++i0) {
|
|
||||||
int i = i0 + i1*nc;
|
|
||||||
sumf += s[i] * c[i];
|
|
||||||
}
|
|
||||||
x[i1] = sumf;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_compute_forward_ssm_conv(
|
static void ggml_compute_forward_ssm_conv(
|
||||||
@ -15910,15 +15846,14 @@ static void ggml_compute_forward_ssm_scan_f32(
|
|||||||
const struct ggml_tensor * src3 = dst->src[3]; // A
|
const struct ggml_tensor * src3 = dst->src[3]; // A
|
||||||
const struct ggml_tensor * src4 = dst->src[4]; // B
|
const struct ggml_tensor * src4 = dst->src[4]; // B
|
||||||
const struct ggml_tensor * src5 = dst->src[5]; // C
|
const struct ggml_tensor * src5 = dst->src[5]; // C
|
||||||
const struct ggml_tensor * src6 = dst->src[6]; // sq
|
|
||||||
|
|
||||||
const int ith = params->ith;
|
const int ith = params->ith;
|
||||||
const int nth = params->nth;
|
const int nth = params->nth;
|
||||||
|
|
||||||
const int64_t nc = src0->ne[0]; // d_state
|
const int64_t nc = src0->ne[0]; // d_state
|
||||||
const int64_t nr = src0->ne[1]; // d_inner
|
const int64_t nr = src0->ne[1]; // d_inner
|
||||||
const int64_t n_t = src1->ne[1]; // number of tokens in the batch
|
const int64_t n_t = src1->ne[1]; // number of tokens per sequence
|
||||||
const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
|
const int64_t n_s = src0->ne[2]; // number of sequences in the batch
|
||||||
|
|
||||||
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
||||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||||
@ -15927,12 +15862,12 @@ static void ggml_compute_forward_ssm_scan_f32(
|
|||||||
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
||||||
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
||||||
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
||||||
// required for the dot product between s and C, and when copying the states
|
// required for the dot product between s and C
|
||||||
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
||||||
// required for per-sequence offsets for states
|
// required for per-sequence offsets for states
|
||||||
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
|
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
|
||||||
// required to get correct offset for state destination (i.e. src1->nb[2])
|
// required to get correct offset for state destination (i.e. src1->nb[3])
|
||||||
GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
|
GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
|
||||||
|
|
||||||
// rows per thread
|
// rows per thread
|
||||||
const int dr = (nr + nth - 1)/nth;
|
const int dr = (nr + nth - 1)/nth;
|
||||||
@ -15942,64 +15877,36 @@ static void ggml_compute_forward_ssm_scan_f32(
|
|||||||
const int ir1 = MIN(ir0 + dr, nr);
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
const int ir = ir1 - ir0;
|
const int ir = ir1 - ir0;
|
||||||
|
|
||||||
if (n_kv > 1) {
|
for (int i3 = 0; i3 < n_s; ++i3) {
|
||||||
// it's hard to know if the source states have already been copied
|
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||||
// when there are multiple, so copy them already.
|
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
|
||||||
for (int i3 = 0; i3 < n_kv; ++i3) {
|
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||||
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
|
||||||
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
|
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
||||||
memcpy(s, s0, nc*ir*sizeof(float));
|
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
|
||||||
}
|
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
|
||||||
}
|
float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||||
|
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
|
||||||
|
|
||||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
// use the output as the source for the next token-wise iterations
|
||||||
int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
|
if (i2 > 0) { s0 = s; }
|
||||||
float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
|
||||||
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
|
|
||||||
float * s0;
|
|
||||||
float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
|
||||||
float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
|
|
||||||
float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
|
||||||
float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
|
|
||||||
float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
|
|
||||||
|
|
||||||
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
// d_inner
|
||||||
|
for (int i1 = 0; i1 < ir; ++i1) {
|
||||||
// avoid needing to copy the state for the first token
|
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
|
||||||
if (i2 == 0) {
|
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
||||||
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
|
float x_dt = x[i1] * dt_soft_plus;
|
||||||
} else {
|
float sumf = 0.0f;
|
||||||
// otherwise the source is the same as the destination
|
// d_state
|
||||||
s0 = s;
|
for (int i0 = 0; i0 < nc; ++i0) {
|
||||||
}
|
int i = i0 + i1*nc;
|
||||||
|
// state = prev_state * dA + dB * x
|
||||||
// d_inner
|
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
||||||
for (int i1 = 0; i1 < ir; ++i1) {
|
// y = rowwise_dotprod(state, C)
|
||||||
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
|
sumf += state * C[i0];
|
||||||
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
s[i] = state;
|
||||||
float x_dt = x[i1] * dt_soft_plus;
|
}
|
||||||
float sumf = 0.0f;
|
y[i1] = sumf;
|
||||||
// d_state
|
|
||||||
for (int i0 = 0; i0 < nc; ++i0) {
|
|
||||||
int i = i0 + i1*nc;
|
|
||||||
// state = prev_state * dA + dB * x
|
|
||||||
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
|
||||||
// y = rowwise_dotprod(state, C)
|
|
||||||
sumf += state * C[i0];
|
|
||||||
s[i] = state;
|
|
||||||
}
|
|
||||||
y[i1] = sumf;
|
|
||||||
}
|
|
||||||
|
|
||||||
// handle copies when there are multiple output states
|
|
||||||
for (int i3 = 1; i3 < n_kv; ++i3) {
|
|
||||||
int32_t seq = sq[i3];
|
|
||||||
if (0 <= seq && seq < n_kv) {
|
|
||||||
float * s1 = s + (seq - sq[0])*nc*nr;
|
|
||||||
memcpy(s1, s, nc*ir*sizeof(float));
|
|
||||||
} else {
|
|
||||||
// stop at negative or too big seq_ids
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -511,6 +511,9 @@ extern "C" {
|
|||||||
// to the decoder to start generating output sequence. For other models, it returns -1.
|
// to the decoder to start generating output sequence. For other models, it returns -1.
|
||||||
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
|
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
|
||||||
|
|
||||||
|
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
|
||||||
|
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns 0 on success
|
// Returns 0 on success
|
||||||
LLAMA_API uint32_t llama_model_quantize(
|
LLAMA_API uint32_t llama_model_quantize(
|
||||||
const char * fname_inp,
|
const char * fname_inp,
|
||||||
|
1524
src/llama.cpp
1524
src/llama.cpp
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user