mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 03:44:35 +00:00
ggml : fix NeoX rope to rotate just first n_dims
This commit is contained in:
parent
0644c3be51
commit
f703ca8a3c
46
ggml-cuda.cu
46
ggml-cuda.cu
@ -4998,31 +4998,29 @@ static __global__ void rope_neox(
|
|||||||
const int ib = col / n_dims;
|
const int ib = col / n_dims;
|
||||||
const int ic = col % n_dims;
|
const int ic = col % n_dims;
|
||||||
|
|
||||||
// IMPORTANT: consider the case ncols == 80 and n_dims == 32 (phi-2)
|
if (ib == 0) {
|
||||||
// I don't know what we are supposed to compute, because the row is not divisible by n_dims
|
const int i = row*ncols + ib*n_dims + ic/2;
|
||||||
// this check matches the CPU code, but it is likely wrong as well
|
const int i2 = row/p_delta_rows;
|
||||||
// I can't understand the Python code, so if you know what to do here, please fix it
|
|
||||||
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
float cur_rot = inv_ndims * ic - ib;
|
||||||
if (ncols % n_dims != 0 && ib == ncols/n_dims) {
|
|
||||||
return;
|
const int p = has_pos ? pos[i2] : 0;
|
||||||
|
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
|
||||||
|
|
||||||
|
float cos_theta, sin_theta;
|
||||||
|
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||||
|
|
||||||
|
const float x0 = x[i + 0];
|
||||||
|
const float x1 = x[i + n_dims/2];
|
||||||
|
|
||||||
|
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||||
|
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||||
|
} else {
|
||||||
|
const int i = row*ncols + ib*n_dims + ic;
|
||||||
|
|
||||||
|
dst[i + 0] = x[i + 0];
|
||||||
|
dst[i + 1] = x[i + 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
const int i = row*ncols + ib*n_dims + ic/2;
|
|
||||||
const int i2 = row/p_delta_rows;
|
|
||||||
|
|
||||||
float cur_rot = inv_ndims * ic - ib;
|
|
||||||
|
|
||||||
const int p = has_pos ? pos[i2] : 0;
|
|
||||||
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
|
|
||||||
|
|
||||||
float cos_theta, sin_theta;
|
|
||||||
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
|
||||||
|
|
||||||
const float x0 = x[i + 0];
|
|
||||||
const float x1 = x[i + n_dims/2];
|
|
||||||
|
|
||||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
|
||||||
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void rope_glm_f32(
|
static __global__ void rope_glm_f32(
|
||||||
|
@ -1702,8 +1702,9 @@ kernel void kernel_rope(
|
|||||||
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
|
||||||
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
|
if (ic < n_dims) {
|
||||||
|
const int64_t ib = 0;
|
||||||
|
|
||||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||||
const float cur_rot = inv_ndims*ic - ib;
|
const float cur_rot = inv_ndims*ic - ib;
|
||||||
@ -1722,6 +1723,14 @@ kernel void kernel_rope(
|
|||||||
|
|
||||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||||
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||||
|
} else {
|
||||||
|
const int64_t i0 = ic;
|
||||||
|
|
||||||
|
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
|
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
dst_data[0] = src[0];
|
||||||
|
dst_data[1] = src[1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
34
ggml.c
34
ggml.c
@ -11408,10 +11408,13 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
// it seems we have to rope just the first n_dims elements and do nothing with the rest
|
||||||
|
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
||||||
theta_base *= freq_scale;
|
theta_base *= freq_scale;
|
||||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
||||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
if (ic < n_dims) {
|
||||||
|
const int64_t ib = 0;
|
||||||
|
|
||||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||||
float cur_rot = inv_ndims * ic - ib;
|
float cur_rot = inv_ndims * ic - ib;
|
||||||
|
|
||||||
@ -11434,6 +11437,14 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
|
|
||||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||||
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||||
|
} else {
|
||||||
|
const int64_t i0 = ic;
|
||||||
|
|
||||||
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
dst_data[0] = src[0];
|
||||||
|
dst_data[1] = src[1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -11561,10 +11572,13 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
// it seems we have to rope just the first n_dims elements and do nothing with the rest
|
||||||
|
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
||||||
theta_base *= freq_scale;
|
theta_base *= freq_scale;
|
||||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
||||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
if (ic < n_dims) {
|
||||||
|
const int64_t ib = 0;
|
||||||
|
|
||||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||||
float cur_rot = inv_ndims * ic - ib;
|
float cur_rot = inv_ndims * ic - ib;
|
||||||
|
|
||||||
@ -11587,6 +11601,14 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
|
|
||||||
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
||||||
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
||||||
|
} else {
|
||||||
|
const int64_t i0 = ic;
|
||||||
|
|
||||||
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
|
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
dst_data[0] = src[0];
|
||||||
|
dst_data[1] = src[1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user