mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 03:44:35 +00:00
ggml : optimize rope function to avoid call powf in the tight loop (#807)
This commit is contained in:
parent
be87b6ed20
commit
c5d70f5c9e
22
ggml.c
22
ggml.c
@ -7507,19 +7507,20 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
// row index used to determine which thread to use
|
// row index used to determine which thread to use
|
||||||
int ir = 0;
|
int ir = 0;
|
||||||
|
|
||||||
|
const float theta_scale = powf(10000.0, ((float)-2)/n_dims);
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||||
const int p = (mode == 0 ? n_past + i2 : i2);
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
if (ir++ < ir0) continue;
|
if (ir++ < ir0) continue;
|
||||||
if (ir > ir1) break;
|
if (ir > ir1) break;
|
||||||
|
float theta = (float)p;
|
||||||
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
||||||
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
const float cos_theta = cosf(theta);
|
||||||
|
const float sin_theta = sinf(theta);
|
||||||
const float cos_theta = cosf(p*theta);
|
|
||||||
const float sin_theta = sinf(p*theta);
|
|
||||||
|
|
||||||
|
theta *= theta_scale;
|
||||||
const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
@ -7580,19 +7581,20 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
// row index used to determine which thread to use
|
// row index used to determine which thread to use
|
||||||
int ir = 0;
|
int ir = 0;
|
||||||
|
|
||||||
|
const float theta_scale = powf(10000.0, ((float)-2)/n_dims);
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
||||||
const int p = (mode == 0 ? n_past + i2 : i2);
|
const int p = (mode == 0 ? n_past + i2 : i2);
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
if (ir++ < ir0) continue;
|
if (ir++ < ir0) continue;
|
||||||
if (ir > ir1) break;
|
if (ir > ir1) break;
|
||||||
|
float theta = (float)p;
|
||||||
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
for (int i0 = 0; i0 < n_dims; i0 += 2) {
|
||||||
const float theta = powf(10000.0, ((float)-i0)/n_dims);
|
const float cos_theta = cosf(theta);
|
||||||
|
const float sin_theta = sinf(theta);
|
||||||
const float cos_theta = cosf(p*theta);
|
|
||||||
const float sin_theta = sinf(p*theta);
|
|
||||||
|
|
||||||
|
theta *= theta_scale;
|
||||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user